src/Data.py

   1 """
   2 Classes for working with (downloading, importing) the online census
   3 data.
   4 """
   5
   6 import os
   7 import urllib
   8 import zipfile
   9
  10 import FileUtils
  11
  12
  13 class State:
  14     """
  15     A state contains zero or more counties and cities. Each state has
  16     its own ID, as well as its own directory on the server.
  17
  18     Example:
  19
  20     SF1:   http://www2.census.gov/census_2000/datasets/Summary_File_1/Maryland
  21     TIGER: http://www2.census.gov/geo/tiger/TIGER2009/24_MARYLAND
  22
  23     """
  24
  25     SF1_ROOT = 'http://www2.census.gov/census_2000/datasets/Summary_File_1'
  26     TIGER_ROOT = 'http://www2.census.gov/geo/tiger/TIGER2009'
  27
  28     def __init__(self, initial_id=None, initial_name=None, abbreviation = None):
  29         self.id = initial_id
  30         self.abbreviation = abbreviation
  31         self.name = initial_name
  32         self.counties = []
  33
  34
  35     def sf1_data_root(self):
  36         sdr =  self.SF1_ROOT + '/'
  37         sdr += self.name.replace(' ', '_')
  38         return sdr
  39
  40
  41     def tiger_data_root(self):
  42         tdr =  self.TIGER_ROOT + '/'
  43         tdr += str(self.id) + '_' + self.name.upper().replace(' ', '_')
  44         return tdr
  45
  46
  47     def sf1_data_url(self):
  48         sdu = self.sf1_data_root() + '/'
  49         sdu += self.sf1_zipfile_name()
  50         return sdu
  51
  52
  53     def blocks_data_url(self):
  54         bdu =  self.tiger_data_root() + '/'
  55         bdu += self.blocks_zipfile_name()
  56         return bdu
  57
  58
  59     def sf1_data_path(self):
  60         sdp = 'data/census2000/'
  61         sdp += self.name.lower().replace(' ', '_')
  62         sdp += '/sf1'
  63         return sdp
  64
  65
  66     def blocks_data_path(self):
  67         bdp =  'data/census2000/'
  68         bdp += self.name.lower().replace(' ', '_')
  69         bdp += '/blocks'
  70         return bdp
  71
  72
  73     def lines_data_path(self):
  74         ldp =  'data/census2000/'
  75         ldp += self.name.lower().replace(' ', '_')
  76         ldp += '/lines'
  77         return ldp
  78
  79
  80     def sf1_zipfile_name(self):
  81         return self.abbreviation.lower() + 'geo_uf1.zip'
  82
  83
  84     def blocks_zipfile_name(self):
  85         return 'tl_2009_' + str(self.id) + '_tabblock00.zip'
  86
  87
  88     def sf1_geo_file_path(self):
  89         sgfp = self.sf1_data_path() + '/'
  90         sgfp += self.abbreviation.lower() + 'geo.uf1'
  91         return sgfp
  92
  93
  94     def blocks_shapefile_path(self):
  95         bsp =  self.blocks_data_path() + '/'
  96         bsp += 'tl_2009_' + str(self.id) + '_tabblock00.shp'
  97         return bsp
  98
  99
 100     def add_county(self, county_id, county_name, override_name=False):
 101         """
 102         We would like each county to have a pointer to its containing
 103         state. This so we can compute the file URL, directory, and so
 104         forth from within the county.
 105         """
 106         self.counties.append(County(county_id,
 107                                     county_name,
 108                                     self,
 109                                     override_name))
 110
 111
 112
 113 class County:
 114     """
 115     A county represents either a county or city. It doesn't make
 116     sense, but 'county-level' data is given for certain cities which
 117     don't technically belong to any county.
 118     """
 119
 120     def __init__(self, initial_id=None,
 121                  initial_name=None,
 122                  initial_state=None,
 123                  override_name=False):
 124         """
 125         If this is a city, we should override our name with
 126         e.g. 'Baltimore city' so that full_name() doesn't transform
 127         'Baltimore' in to 'Baltmore County'.
 128         """
 129         self.id = initial_id
 130         self.name = initial_name
 131         self.state = initial_state
 132         self.override_name = override_name
 133
 134
 135     def state_county_id(self):
 136         return str(self.state.id) + ("%03d" % self.id)
 137
 138
 139     def full_name(self):
 140         """
 141         Some of the counties (e.g. Baltimore City, Washington D.C.),
 142         need to have their names overridden since they aren't
 143         technically counties, but are treated as such by the Census.
 144         """
 145         if (self.override_name == False):
 146             return self.name + ' County'
 147         else:
 148             # "Override name" basically means, "use the name I passed
 149             # you and don't add the word 'County' on to it."
 150             return self.name
 151
 152
 153     def lines_data_url(self):
 154         tdp =  self.state.tiger_data_root() + '/'
 155         tdp += self.state_county_id()
 156         tdp += '_' + self.full_name().replace(' ', '_') + '/'
 157         tdp += self.lines_zipfile_name()
 158         return tdp
 159
 160
 161     def lines_zipfile_name(self):
 162         return 'tl_2009_' + self.state_county_id() + '_edges.zip'
 163
 164
 165     def lines_shapefile_path(self):
 166         sfp =  self.state.lines_data_path() + '/'
 167         sfp += 'tl_2009_' + self.state_county_id() + '_edges.shp'
 168         return sfp
 169
 170
 171
 172 def download_sf1(states):
 173     """
 174     Download the Summary File 1 geo file for each state.
 175     """
 176
 177     for state in states:
 178         # First, create the blocks data path if it doesn't exist.
 179         FileUtils.mkdir_p(state.sf1_data_path(), 0755)
 180
 181         if not os.path.exists(state.sf1_geo_file_path()):
 182             url = state.sf1_data_url()
 183             tmpfile = state.sf1_zipfile_name()
 184             print "Grabbing SF1 data for %s." % state.name
 185             print "Downloading %s to %s..." % (url, tmpfile)
 186
 187             try:
 188                 # This can fail for a bunch of reasons...
 189                 urllib.urlretrieve(url, tmpfile)
 190                 print "Unzipping %s to %s..." % (tmpfile, state.sf1_data_path())
 191                 z = zipfile.ZipFile(tmpfile)
 192                 z.extractall(state.sf1_data_path())
 193             except:
 194                 # That we don't care about.
 195                 pass
 196             finally:
 197                 # But we always clean up after ourselves.
 198                 print "Removing %s..." % tmpfile
 199                 FileUtils.rm_f(tmpfile)
 200                 print "Done.\n"
 201
 202
 203
 204 def download_blocks(states):
 205     """
 206     Download the TIGER/Line block files for each state.
 207     """
 208
 209     for state in states:
 210         # First, create the blocks data path if it doesn't exist.
 211         FileUtils.mkdir_p(state.blocks_data_path(), 0755)
 212
 213         if not os.path.exists(state.blocks_shapefile_path()):
 214             url = state.blocks_data_url()
 215             tmpfile = state.blocks_zipfile_name()
 216             print "Grabbing TIGER blocks data for %s." % state.name
 217             print "Downloading %s to %s..." % (url, tmpfile)
 218
 219             try:
 220                 # This can fail for a bunch of reasons...
 221                 urllib.urlretrieve(url, tmpfile)
 222                 print "Unzipping %s to %s..." % (tmpfile, state.blocks_data_path())
 223                 z = zipfile.ZipFile(tmpfile)
 224                 z.extractall(state.blocks_data_path())
 225             except:
 226                 # That we don't care about.
 227                 pass
 228             finally:
 229                 # But we always clean up after ourselves.
 230                 print "Removing %s..." % tmpfile
 231                 FileUtils.rm_f(tmpfile)
 232                 print "Done.\n"
 233
 234
 235
 236 def download_lines(states):
 237     """
 238     Download the TIGER/Line 'all lines' files for each county in states.
 239     """
 240
 241     for state in states:
 242         # First, create the lines data path if it doesn't exist.
 243         FileUtils.mkdir_p(state.lines_data_path(), 0755)
 244
 245         # Now loop through the counties, and download/unzip the lines
 246         # data if necessary.
 247         for county in state.counties:
 248             if not os.path.exists(county.lines_shapefile_path()):
 249                 url = county.lines_data_url()
 250                 tmpfile = county.lines_zipfile_name()
 251                 print "Grabbing TIGER lines data for %s (%s)." % (county.full_name(), state.name)
 252                 print "Downloading %s to %s..." % (url, tmpfile)
 253
 254                 try:
 255                     # This can fail for a bunch of reasons...
 256                     urllib.urlretrieve(url, tmpfile)
 257                     print "Unzipping %s to %s..." % (tmpfile, state.lines_data_path())
 258                     z = zipfile.ZipFile(tmpfile)
 259                     z.extractall(state.lines_data_path())
 260                 except:
 261                     # That we don't care about.
 262                     pass
 263                 finally:
 264                      # But we always clean up after ourselves.
 265                     print "Removing %s..." % tmpfile
 266                     FileUtils.rm_f(tmpfile)
 267                     print "Done.\n"