""" Classes for working with (downloading, importing) the online census data. """ import os import urllib import zipfile import FileUtils class State: """ A state contains zero or more counties and cities. Each state has its own ID, as well as its own directory on the server. Example: http://www2.census.gov/geo/tiger/TIGER2009/24_MARYLAND """ TIGER_ROOT = 'http://www2.census.gov/geo/tiger/TIGER2009' def __init__(self, initial_id=None, initial_name=None): self.id = initial_id self.name = initial_name self.counties = [] def tiger_data_url(self): tdu = self.TIGER_ROOT + '/' tdu += str(self.id) + '_' + self.name.upper().replace(' ', '_') return tdu def lines_data_path(self): ldp = 'data/census2000/' ldp += self.name.lower().replace(' ', '_') ldp += '/lines' return ldp def add_county(self, county_id, county_name, override_name=False): """ We would like each county to have a pointer to its containing state. This so we can compute the file URL, directory, and so forth from within the county. """ self.counties.append(County(county_id, county_name, self, override_name)) class County: """ A county represents either a county or city. It doesn't make sense, but 'county-level' data is given for certain cities which don't technically belong to any county. """ def __init__(self, initial_id=None, initial_name=None, initial_state=None, override_name=False): """ If this is a city, we should override our name with e.g. 'Baltimore city' so that full_name() doesn't transform 'Baltimore' in to 'Baltmore County'. """ self.id = initial_id self.name = initial_name self.state = initial_state self.override_name = override_name def state_county_id(self): return str(self.state.id) + ("%03d" % self.id) def full_name(self): """ Some of the counties (e.g. Baltimore City, Washington D.C.), need to have their names overridden since they aren't technically counties, but are treated as such by the Census. """ if (self.override_name == False): return self.name + ' County' else: # "Override name" basically means, "use the name I passed # you and don't add the word 'County' on to it." return self.name def tiger_data_url(self): tdp = self.state.tiger_data_url() + '/' tdp += self.state_county_id() tdp += '_' + self.full_name().replace(' ', '_') + '/' tdp += self.zipfile_name() return tdp def zipfile_name(self): return 'tl_2009_' + self.state_county_id() + '_edges.zip' def shapefile_path(self): sfp = self.state.lines_data_path() + '/' sfp += 'tl_2009_' + self.state_county_id() + '_edges.shp' return sfp def download_lines(states): """ Download the TIGER/Line 'all lines' files for each county in states. """ for state in states: # First, create the lines data path if it doesn't exist. FileUtils.mkdir_p(state.lines_data_path(), 0755) # Now loop through the counties, and download/unzip the lines # data if necessary. for county in state.counties: if not os.path.exists(county.shapefile_path()): url = county.tiger_data_url() tmpfile = county.zipfile_name() print "Grabbing data for %s (%s)." % (county.full_name(), state.name) print "Downloading %s to %s..." % (url, tmpfile) try: # This can fail for a bunch of reasons... urllib.urlretrieve(url, tmpfile) print "Unzipping %s to %s..." % (tmpfile, state.lines_data_path()) z = zipfile.ZipFile(tmpfile) z.extractall(state.lines_data_path()) except: # That we don't care about. pass finally: # But we always clean up after ourselves. print "Removing %s..." % tmpfile FileUtils.rm_f(tmpfile) print "Done.\n"