From a6cd5954135c79ba6057a933f38b961e4e55c2d2 Mon Sep 17 00:00:00 2001 From: Michael Orlitzky Date: Wed, 7 Oct 2009 11:02:57 -0400 Subject: [PATCH] Added the Data module which contains the classes and functions used to download the online census data files. --- src/Data.py | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 src/Data.py diff --git a/src/Data.py b/src/Data.py new file mode 100644 index 0000000..a8d962d --- /dev/null +++ b/src/Data.py @@ -0,0 +1,144 @@ +""" +Classes for working with (downloading, importing) the online census +data. +""" + +import os +import urllib +import zipfile + +import FileUtils + + +class State: + """ + A state contains zero or more counties and cities. Each state has + its own ID, as well as its own directory on the server. + + Example: + + http://www2.census.gov/geo/tiger/TIGER2009/24_MARYLAND + + """ + + TIGER_ROOT = 'http://www2.census.gov/geo/tiger/TIGER2009' + + def __init__(self, initial_id=None, initial_name=None): + self.id = initial_id + self.name = initial_name + self.counties = [] + + + def tiger_data_url(self): + tdu = self.TIGER_ROOT + '/' + tdu += str(self.id) + '_' + self.name.upper().replace(' ', '_') + return tdu + + + def lines_data_path(self): + ldp = 'data/census2000/' + ldp += self.name.lower().replace(' ', '_') + ldp += '/lines' + return ldp + + + def add_county(self, county_id, county_name, override_name=False): + """ + We would like each county to have a pointer to its containing + state. This so we can compute the file URL, directory, and so + forth from within the county. + """ + self.counties.append(County(county_id, county_name, self, override_name)) + + + +class County: + """ + A county represents either a county or city. It doesn't make + sense, but 'county-level' data is given for certain cities which + don't technically belong to any county. + """ + + def __init__(self, initial_id=None, + initial_name=None, + initial_state=None, + override_name=False): + """ + If this is a city, we should override our name with + e.g. 'Baltimore city' so that full_name() doesn't transform + 'Baltimore' in to 'Baltmore County'. + """ + self.id = initial_id + self.name = initial_name + self.state = initial_state + self.override_name = override_name + + + def state_county_id(self): + return str(self.state.id) + ("%03d" % self.id) + + + def full_name(self): + """ + Some of the counties (e.g. Baltimore City, Washington D.C.), + need to have their names overridden since they aren't + technically counties, but are treated as such by the Census. + """ + if (self.override_name == False): + return self.name + ' County' + else: + # "Override name" basically means, "use the name I passed + # you and don't add the word 'County' on to it." + return self.name + + + def tiger_data_url(self): + tdp = self.state.tiger_data_url() + '/' + tdp += self.state_county_id() + tdp += '_' + self.full_name().replace(' ', '_') + '/' + tdp += self.zipfile_name() + return tdp + + + def zipfile_name(self): + return 'tl_2009_' + self.state_county_id() + '_edges.zip' + + + def shapefile_path(self): + sfp = self.state.lines_data_path() + '/' + sfp += 'tl_2009_' + self.state_county_id() + '_edges.shp' + return sfp + + +def download_lines(states): + """ + Download the TIGER/Line 'all lines' files for each county in states. + """ + + for state in states: + # First, create the lines data path if it doesn't exist. + FileUtils.mkdir_p(state.lines_data_path(), 0755) + + # Now loop through the counties, and download/unzip the lines + # data if necessary. + for county in state.counties: + if not os.path.exists(county.shapefile_path()): + url = county.tiger_data_url() + tmpfile = county.zipfile_name() + print "Grabbing data for %s (%s)." % (county.full_name(), state.name) + print "Downloading %s to %s..." % (url, tmpfile) + + try: + # This can fail for a bunch of reasons... + urllib.urlretrieve(url, tmpfile) + print "Unzipping %s to %s..." % (tmpfile, state.lines_data_path()) + z = zipfile.ZipFile(tmpfile) + z.extractall(state.lines_data_path()) + except: + # That we don't care about. + pass + finally: + # But we always clean up after ourselves. + print "Removing %s..." % tmpfile + FileUtils.rm_f(tmpfile) + print "Done.\n" -- 2.44.2