src/Data.py

   1 """
   2 Classes for working with (downloading, importing) the online census
   3 data.
   4 """
   5
   6 import os
   7 import urllib
   8 import zipfile
   9
  10 import FileUtils
  11
  12
  13 class State:
  14     """
  15     A state contains zero or more counties and cities. Each state has
  16     its own ID, as well as its own directory on the server.
  17
  18     Example:
  19
  20     http://www2.census.gov/geo/tiger/TIGER2009/24_MARYLAND
  21
  22     """
  23
  24     TIGER_ROOT = 'http://www2.census.gov/geo/tiger/TIGER2009'
  25
  26     def __init__(self, initial_id=None, initial_name=None):
  27         self.id = initial_id
  28         self.name = initial_name
  29         self.counties = []
  30
  31
  32     def tiger_data_root(self):
  33         tdr =  self.TIGER_ROOT + '/'
  34         tdr += str(self.id) + '_' + self.name.upper().replace(' ', '_')
  35         return tdr
  36
  37
  38     def blocks_data_url(self):
  39         bdu =  self.tiger_data_root() + '/'
  40         bdu += self.blocks_zipfile_name()
  41         return bdu
  42
  43
  44     def blocks_data_path(self):
  45         bdp =  'data/census2000/'
  46         bdp += self.name.lower().replace(' ', '_')
  47         bdp += '/blocks'
  48         return bdp
  49
  50
  51     def lines_data_path(self):
  52         ldp =  'data/census2000/'
  53         ldp += self.name.lower().replace(' ', '_')
  54         ldp += '/lines'
  55         return ldp
  56
  57
  58     def blocks_zipfile_name(self):
  59         return 'tl_2009_' + str(self.id) + '_tabblock00.zip'
  60
  61
  62     def blocks_shapefile_path(self):
  63         bsp =  self.blocks_data_path() + '/'
  64         bsp += 'tl_2009_' + str(self.id) + '_tabblock00.shp'
  65         return bsp
  66
  67
  68     def add_county(self, county_id, county_name, override_name=False):
  69         """
  70         We would like each county to have a pointer to its containing
  71         state. This so we can compute the file URL, directory, and so
  72         forth from within the county.
  73         """
  74         self.counties.append(County(county_id,
  75                                     county_name,
  76                                     self,
  77                                     override_name))
  78
  79
  80
  81 class County:
  82     """
  83     A county represents either a county or city. It doesn't make
  84     sense, but 'county-level' data is given for certain cities which
  85     don't technically belong to any county.
  86     """
  87
  88     def __init__(self, initial_id=None,
  89                  initial_name=None,
  90                  initial_state=None,
  91                  override_name=False):
  92         """
  93         If this is a city, we should override our name with
  94         e.g. 'Baltimore city' so that full_name() doesn't transform
  95         'Baltimore' in to 'Baltmore County'.
  96         """
  97         self.id = initial_id
  98         self.name = initial_name
  99         self.state = initial_state
 100         self.override_name = override_name
 101
 102
 103     def state_county_id(self):
 104         return str(self.state.id) + ("%03d" % self.id)
 105
 106
 107     def full_name(self):
 108         """
 109         Some of the counties (e.g. Baltimore City, Washington D.C.),
 110         need to have their names overridden since they aren't
 111         technically counties, but are treated as such by the Census.
 112         """
 113         if (self.override_name == False):
 114             return self.name + ' County'
 115         else:
 116             # "Override name" basically means, "use the name I passed
 117             # you and don't add the word 'County' on to it."
 118             return self.name
 119
 120
 121     def lines_data_url(self):
 122         tdp =  self.state.tiger_data_root() + '/'
 123         tdp += self.state_county_id()
 124         tdp += '_' + self.full_name().replace(' ', '_') + '/'
 125         tdp += self.lines_zipfile_name()
 126         return tdp
 127
 128
 129     def lines_zipfile_name(self):
 130         return 'tl_2009_' + self.state_county_id() + '_edges.zip'
 131
 132
 133     def lines_shapefile_path(self):
 134         sfp =  self.state.lines_data_path() + '/'
 135         sfp += 'tl_2009_' + self.state_county_id() + '_edges.shp'
 136         return sfp
 137
 138
 139
 140 def download_blocks(states):
 141     """
 142     Download the TIGER/Line block files for each state.
 143     """
 144
 145     for state in states:
 146         # First, create the blocks data path if it doesn't exist.
 147         FileUtils.mkdir_p(state.blocks_data_path(), 0755)
 148
 149         if not os.path.exists(state.blocks_shapefile_path()):
 150             url = state.blocks_data_url()
 151             tmpfile = state.blocks_zipfile_name()
 152             print "Grabbing data for %s." % state.name
 153             print "Downloading %s to %s..." % (url, tmpfile)
 154
 155             try:
 156                 # This can fail for a bunch of reasons...
 157                 urllib.urlretrieve(url, tmpfile)
 158                 print "Unzipping %s to %s..." % (tmpfile, state.blocks_data_path())
 159                 z = zipfile.ZipFile(tmpfile)
 160                 z.extractall(state.blocks_data_path())
 161             except:
 162                 # That we don't care about.
 163                 pass
 164             finally:
 165                 # But we always clean up after ourselves.
 166                 print "Removing %s..." % tmpfile
 167                 FileUtils.rm_f(tmpfile)
 168                 print "Done.\n"
 169
 170
 171
 172 def download_lines(states):
 173     """
 174     Download the TIGER/Line 'all lines' files for each county in states.
 175     """
 176
 177     for state in states:
 178         # First, create the lines data path if it doesn't exist.
 179         FileUtils.mkdir_p(state.lines_data_path(), 0755)
 180
 181         # Now loop through the counties, and download/unzip the lines
 182         # data if necessary.
 183         for county in state.counties:
 184             if not os.path.exists(county.lines_shapefile_path()):
 185                 url = county.tiger_data_url()
 186                 tmpfile = county.lines_zipfile_name()
 187                 print "Grabbing data for %s (%s)." % (county.full_name(), state.name)
 188                 print "Downloading %s to %s..." % (url, tmpfile)
 189
 190                 try:
 191                     # This can fail for a bunch of reasons...
 192                     urllib.urlretrieve(url, tmpfile)
 193                     print "Unzipping %s to %s..." % (tmpfile, state.lines_data_path())
 194                     z = zipfile.ZipFile(tmpfile)
 195                     z.extractall(state.lines_data_path())
 196                 except:
 197                     # That we don't care about.
 198                     pass
 199                 finally:
 200                      # But we always clean up after ourselves.
 201                     print "Removing %s..." % tmpfile
 202                     FileUtils.rm_f(tmpfile)
 203                     print "Done.\n"