src/Data.py

   1 """
   2 Classes for working with (downloading, importing) the online census
   3 data.
   4 """
   5
   6 import os
   7 import urllib
   8 import zipfile
   9
  10 import FileUtils
  11
  12
  13 class State:
  14     """
  15     A state contains zero or more counties and cities. Each state has
  16     its own ID, as well as its own directory on the server.
  17
  18     Example:
  19
  20     http://www2.census.gov/geo/tiger/TIGER2009/24_MARYLAND
  21
  22     """
  23
  24     TIGER_ROOT = 'http://www2.census.gov/geo/tiger/TIGER2009'
  25
  26     def __init__(self, initial_id=None, initial_name=None):
  27         self.id = initial_id
  28         self.name = initial_name
  29         self.counties = []
  30
  31
  32     def tiger_data_url(self):
  33         tdu =  self.TIGER_ROOT + '/'
  34         tdu += str(self.id) + '_' + self.name.upper().replace(' ', '_')
  35         return tdu
  36
  37
  38     def lines_data_path(self):
  39         ldp =  'data/census2000/'
  40         ldp += self.name.lower().replace(' ', '_')
  41         ldp += '/lines'
  42         return ldp
  43
  44
  45     def add_county(self, county_id, county_name, override_name=False):
  46         """
  47         We would like each county to have a pointer to its containing
  48         state. This so we can compute the file URL, directory, and so
  49         forth from within the county.
  50         """
  51         self.counties.append(County(county_id, county_name, self, override_name))
  52
  53
  54
  55 class County:
  56     """
  57     A county represents either a county or city. It doesn't make
  58     sense, but 'county-level' data is given for certain cities which
  59     don't technically belong to any county.
  60     """
  61
  62     def __init__(self, initial_id=None,
  63                  initial_name=None,
  64                  initial_state=None,
  65                  override_name=False):
  66         """
  67         If this is a city, we should override our name with
  68         e.g. 'Baltimore city' so that full_name() doesn't transform
  69         'Baltimore' in to 'Baltmore County'.
  70         """
  71         self.id = initial_id
  72         self.name = initial_name
  73         self.state = initial_state
  74         self.override_name = override_name
  75
  76
  77     def state_county_id(self):
  78         return str(self.state.id) + ("%03d" % self.id)
  79
  80
  81     def full_name(self):
  82         """
  83         Some of the counties (e.g. Baltimore City, Washington D.C.),
  84         need to have their names overridden since they aren't
  85         technically counties, but are treated as such by the Census.
  86         """
  87         if (self.override_name == False):
  88             return self.name + ' County'
  89         else:
  90             # "Override name" basically means, "use the name I passed
  91             # you and don't add the word 'County' on to it."
  92             return self.name
  93
  94
  95     def tiger_data_url(self):
  96         tdp =  self.state.tiger_data_url() + '/'
  97         tdp += self.state_county_id()
  98         tdp += '_' + self.full_name().replace(' ', '_') + '/'
  99         tdp += self.zipfile_name()
 100         return tdp
 101
 102
 103     def zipfile_name(self):
 104         return 'tl_2009_' + self.state_county_id() + '_edges.zip'
 105
 106
 107     def shapefile_path(self):
 108         sfp =  self.state.lines_data_path() + '/'
 109         sfp += 'tl_2009_' + self.state_county_id() + '_edges.shp'
 110         return sfp
 111
 112
 113 def download_lines(states):
 114     """
 115     Download the TIGER/Line 'all lines' files for each county in states.
 116     """
 117
 118     for state in states:
 119         # First, create the lines data path if it doesn't exist.
 120         FileUtils.mkdir_p(state.lines_data_path(), 0755)
 121
 122         # Now loop through the counties, and download/unzip the lines
 123         # data if necessary.
 124         for county in state.counties:
 125             if not os.path.exists(county.shapefile_path()):
 126                 url = county.tiger_data_url()
 127                 tmpfile = county.zipfile_name()
 128                 print "Grabbing data for %s (%s)." % (county.full_name(), state.name)
 129                 print "Downloading %s to %s..." % (url, tmpfile)
 130
 131                 try:
 132                     # This can fail for a bunch of reasons...
 133                     urllib.urlretrieve(url, tmpfile)
 134                     print "Unzipping %s to %s..." % (tmpfile, state.lines_data_path())
 135                     z = zipfile.ZipFile(tmpfile)
 136                     z.extractall(state.lines_data_path())
 137                 except:
 138                     # That we don't care about.
 139                     pass
 140                 finally:
 141                     # But we always clean up after ourselves.
 142                     print "Removing %s..." % tmpfile
 143                     FileUtils.rm_f(tmpfile)
 144                     print "Done.\n"