Added the Data module which contains the classes and functions used to download the...

author Michael Orlitzky <michael@orlitzky.com>

Wed, 7 Oct 2009 15:02:57 +0000 (11:02 -0400)

committer Michael Orlitzky <michael@orlitzky.com>

Wed, 7 Oct 2009 15:02:57 +0000 (11:02 -0400)
author Michael Orlitzky <michael@orlitzky.com>
Wed, 7 Oct 2009 15:02:57 +0000 (11:02 -0400)
committer Michael Orlitzky <michael@orlitzky.com>
Wed, 7 Oct 2009 15:02:57 +0000 (11:02 -0400)
diff --git a/src/Data.py b/src/Data.py

new file mode 100644 (file)

index 0000000..a8d962d
--- /dev/null
+++ b/src/Data.py
@@ -0,0 +1,144 @@
+"""
+Classes for working with (downloading, importing) the online census
+data.
+"""
+
+import os
+import urllib
+import zipfile
+
+import FileUtils
+
+
+class State:
+    """
+    A state contains zero or more counties and cities. Each state has
+    its own ID, as well as its own directory on the server.
+
+    Example:
+
+    http://www2.census.gov/geo/tiger/TIGER2009/24_MARYLAND
+    
+    """
+
+    # Base URL to which each state's directory name is appended.
+    TIGER_ROOT = 'http://www2.census.gov/geo/tiger/TIGER2009'
+    
+    def __init__(self, initial_id=None, initial_name=None):
+        """
+        Store the state's ID and name, and start with an empty list
+        of counties (see add_county).
+        """
+        self.id = initial_id
+        self.name = initial_name
+        self.counties = []
+
+
+    def tiger_data_url(self):
+        """
+        Return the URL of this state's directory: TIGER_ROOT, a
+        slash, the ID, an underscore, and then the upper-cased name
+        with spaces replaced by underscores.
+        """
+        tdu =  self.TIGER_ROOT + '/'
+        # NOTE(review): str() does not zero-pad the ID, so an integer
+        # ID below 10 would produce e.g. '1_...' rather than '01_...'.
+        # Confirm what type of ID the callers pass in.
+        tdu += str(self.id) + '_' + self.name.upper().replace(' ', '_')
+        return tdu
+
+
+    def lines_data_path(self):
+        """
+        Return the relative path of the directory holding this
+        state's lines data: 'data/census2000/<name>/lines', where
+        <name> is the lower-cased state name with spaces replaced by
+        underscores.
+        """
+        ldp =  'data/census2000/'
+        ldp += self.name.lower().replace(' ', '_')
+        ldp += '/lines'
+        return ldp
+
+
+    def add_county(self, county_id, county_name, override_name=False):
+        """
+        Create a County and append it to this state's list of
+        counties.
+
+        We would like each county to have a pointer to its containing
+        state. This is so that we can compute the file URL, directory,
+        and so forth from within the county. The override_name flag
+        is passed through to the County unchanged.
+        """
+        self.counties.append(County(county_id, county_name, self, override_name))
+
+        
+    
+class County:
+    """
+    A county represents either a county or a city. It doesn't make
+    sense, but 'county-level' data is given for certain cities which
+    don't technically belong to any county.
+    """
+    
+    def __init__(self, initial_id=None,
+                 initial_name=None,
+                 initial_state=None,
+                 override_name=False):
+        """
+        Store the county's ID, name, containing state, and the
+        override_name flag.
+
+        If this is a city, we should override our name with
+        e.g. 'Baltimore city' so that full_name() doesn't transform
+        'Baltimore' into 'Baltimore County'.
+        """
+        self.id = initial_id
+        self.name = initial_name
+        self.state = initial_state
+        self.override_name = override_name
+
+
+    def state_county_id(self):
+        """
+        Return the state's ID followed by this county's ID, the
+        latter zero-padded to three digits.
+        """
+        # The "%03d" format requires self.id to be a number.
+        # NOTE(review): the state ID is not padded here; if state IDs
+        # are integers below 10 the result would lack a leading
+        # zero. Confirm against the callers.
+        return str(self.state.id) + ("%03d" % self.id)
+
+
+    def full_name(self):
+        """
+        Return the name with ' County' appended, unless override_name
+        was set, in which case the name is returned as-is.
+
+        Some of the counties (e.g. Baltimore City, Washington D.C.)
+        need to have their names overridden since they aren't
+        technically counties, but are treated as such by the Census.
+        """
+        if (self.override_name == False):
+            return self.name + ' County'
+        else:
+            # "Override name" basically means, "use the name I passed
+            # you and don't add the word 'County' on to it."
+            return self.name
+
+    
+    def tiger_data_url(self):
+        """
+        Return the URL of this county's zip file. It is built from
+        the state's URL, a directory named after state_county_id()
+        and full_name() (spaces replaced by underscores), and finally
+        zipfile_name().
+        """
+        tdp =  self.state.tiger_data_url() + '/'
+        tdp += self.state_county_id()
+        tdp += '_' + self.full_name().replace(' ', '_') + '/'
+        tdp += self.zipfile_name()
+        return tdp
+
+
+    def zipfile_name(self):
+        """
+        Return the file name (no directory) of this county's zip
+        file, 'tl_2009_<state_county_id>_edges.zip'.
+        """
+        return 'tl_2009_' + self.state_county_id() + '_edges.zip'
+
+    
+    def shapefile_path(self):
+        """
+        Return the path of this county's shapefile,
+        'tl_2009_<state_county_id>_edges.shp', within the state's
+        lines data directory.
+        """
+        sfp =  self.state.lines_data_path() + '/'
+        sfp += 'tl_2009_' + self.state_county_id() + '_edges.shp'
+        return sfp
+
+
+def download_lines(states):
+    """
+    Download the TIGER/Line 'all lines' files for each county in states.
+    """
+    
+    for state in states:
+        # First, create the lines data path if it doesn't exist.
+        FileUtils.mkdir_p(state.lines_data_path(), 0755)
+    
+        # Now loop through the counties, and download/unzip the lines
+        # data if necessary.
+        for county in state.counties:
+            if not os.path.exists(county.shapefile_path()):
+                url = county.tiger_data_url()
+                tmpfile = county.zipfile_name()
+                print "Grabbing data for %s (%s)." % (county.full_name(), state.name)
+                print "Downloading %s to %s..." % (url, tmpfile)
+                
+                try:
+                    # This can fail for a bunch of reasons...
+                    urllib.urlretrieve(url, tmpfile)
+                    print "Unzipping %s to %s..." % (tmpfile, state.lines_data_path())
+                    z = zipfile.ZipFile(tmpfile)
+                    z.extractall(state.lines_data_path())
+                except:
+                    # That we don't care about.
+                    pass
+                finally:
+                    # But we always clean up after ourselves.
+                    print "Removing %s..." % tmpfile
+                    FileUtils.rm_f(tmpfile)
+                    print "Done.\n"
author	Michael Orlitzky <michael@orlitzky.com>
	Wed, 7 Oct 2009 15:02:57 +0000 (11:02 -0400)
committer	Michael Orlitzky <michael@orlitzky.com>
	Wed, 7 Oct 2009 15:02:57 +0000 (11:02 -0400)