Added tests and fixtures for the LEHD module.
Updated the run_tests script to include the new LEHD tests.
from Tests.Unit import FileUtilsTest
from Tests.Unit import GeometryTest
from Tests.Unit import KMLTest
+from Tests.Unit import LEHDTest
from Tests.Unit import SummaryFile1Test
from Tests.Unit import StringUtilsTest
suite.addTest(KMLTest.suite())
suite.addTest(SummaryFile1Test.suite())
suite.addTest(StringUtilsTest.suite())
+suite.addTest(LEHDTest.suite())
unittest.TextTestRunner(verbosity=2).run(suite)
--- /dev/null
+"""
+Classes for parsing LEHD (Longitudinal Employer-Household Dynamics)
+data. In particular, we currently parse the Origin-Destination
+data provided by OnTheMap:
+
+ http://www.vrdc.cornell.edu/onthemap/doc/index.html
+
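+A minimal usage sketch (the file name below is hypothetical; any
+Origin-Destination CSV with the version 3 header should work):
+
+    parser = OriginDestinationRecordParser()
+    records = parser.parse_file('od_main_JT00.csv')
+    records[0].w_geocode    # e.g. '240010001001011'
+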
+"""
+
+from Errors import RecordError
+
+
+class OriginDestinationRecord(object):
+ """
+ Represents one record in an Origin-Destination matrix file. The
+ field contained within one of these files are recorded in the
+ first row of the appropriate CSV files. At this time,
+ documentation does not seem to be available for the newer version
+ 3.x fields.
+ """
+    NUM_FIELDS = 13
+
+
+class OriginDestinationRecordParser(object):
+
+    # This is the header for an Origin-Destination matrix file. If
+    # this isn't the first line of our file, then something is wrong.
+    HEADER = 'w_geocode,h_geocode,total,age1,age2,age3,earn1,earn2,earn3,ind1,ind2,ind3,createdate'
+
+    def parse_file(self, path):
+        """
+        Assuming that path refers to an Origin-Destination file, parse
+        each record contained therein. These files should simply be
+        CSV with no text qualifiers, and can therefore be parsed based
+        on comma placement alone.
+        """
+
+        # Our return value, a list of records.
+        records = []
+
+        # Use a with-block so the file is closed even if a
+        # RecordError is raised partway through parsing.
+        with open(path, 'r') as f:
+            first_line = f.readline().strip()
+            if first_line != self.HEADER:
+                raise RecordError('According to the header (first row), this is not an Origin-Destination matrix file. The first line of an Origin-Destination matrix file should be:\n %s\nbut the first line of your file is:\n %s\n' % (self.HEADER, first_line))
+
+            # The header line has already been consumed, so iteration
+            # begins on the first data line.
+            for line in f:
+                records.append(self.parse_line(line))
+
+        return records
+
+
+    def parse_line(self, line):
+        """
+        Parse one line of an Origin-Destination matrix file into an
+        OriginDestinationRecord object.
+        """
+        # Strip the trailing newline so that the last field
+        # (createdate) does not end up with a '\n' attached.
+        fields = line.strip().split(',')
+
+        if len(fields) < OriginDestinationRecord.NUM_FIELDS:
+            raise RecordError("The line,\n %s \ndoes not contain enough fields. The minimum number of fields required in an Origin-Destination matrix file is %d; this line contains %d fields." % (line, OriginDestinationRecord.NUM_FIELDS, len(fields)))
+
+        od = OriginDestinationRecord()
+        od.w_geocode = fields[0]
+        od.h_geocode = fields[1]
+        od.total = fields[2]
+        od.age1 = fields[3]
+        od.age2 = fields[4]
+        od.age3 = fields[5]
+        od.earn1 = fields[6]
+        od.earn2 = fields[7]
+        od.earn3 = fields[8]
+        od.ind1 = fields[9]
+        od.ind2 = fields[10]
+        od.ind3 = fields[11]
+        od.createdate = fields[12]
+
+        return od
--- /dev/null
+240010001001011,240010001001033,1,1,0,0,0,1,0,0,1,0,20090211
+240010001001011,240010001002059,1,0,1,0,0,1,0,0,1,0,20090211
+240010001001011,240010001002070,1,1,0,0,1,0,0,0,1,0,20090211
+240010001001011,240010001002155,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240010004001008,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240010004002022,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240037401042006,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240317017032003,1,1,0,0,1,0,0,0,1,0,20090211
+240010001001011,240338005114011,1,0,0,1,1,0,0,0,1,0,20090211
+240010001001011,240430107002085,1,1,0,0,1,0,0,0,1,0,20090211
--- /dev/null
+w_geocode,h_geocode,total,age1,age2,age3,earn1,earn2,earn3,ind1,ind2,ind3
+240010001001011,240010001001033,1,1,0,0,0,1,0,0,1,0
+240010001001011,240010001002059,1,0,1,0,0,1,0,0,1,0
+240010001001011,240010001002070,1,1,0,0,1,0,0,0,1,0
+240010001001011,240010001002155,1,0,1,0,1,0,0,0,1,0
+240010001001011,240010004001008,1,0,1,0,1,0,0,0,1,0
+240010001001011,240010004002022,1,0,1,0,1,0,0,0,1,0
+240010001001011,240037401042006,1,0,1,0,1,0,0,0,1,0
+240010001001011,240317017032003,1,1,0,0,1,0,0,0,1,0
+240010001001011,240338005114011,1,0,0,1,1,0,0,0,1,0
+240010001001011,240430107002085,1,1,0,0,1,0,0,0,1,0
--- /dev/null
+w_geocode,h_geocode,total,age1,age2,age3,earn1,earn2,earn3,ind1,ind2,ind3,createdate
+240010001001011,240010001001033,1,1,0,0,0,1,0,0,1,0,20090211
+240010001001011,240010001002059,1,0,1,0,0,1,0,0,1,0,20090211
+240010001001011,240010001002070,1,1,0,0,1,0,0,0,1,0,20090211
+240010001001011,240010001002155,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240010004001008,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240010004002022,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240037401042006,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240317017032003,1,1,0,0,1,0,0,0,1,0,20090211
+240010001001011,240338005114011,1,0,0,1,1,0,0,0,1,0,20090211
+240010001001011,240430107002085,1,1,0,0,1,0,0,0,1,0,20090211
--- /dev/null
+import unittest
+
+import Tests.Fixtures
+from Errors import RecordError
+import LEHD
+
+
+class OriginDestinationRecordParserTest(unittest.TestCase):
+
+    def setUp(self):
+        self.odrp = LEHD.OriginDestinationRecordParser()
+
+
+    def testAllOfSubsetParsed(self):
+        fixture_path = Tests.Fixtures.Path() + '/LEHD/ten_records.csv'
+        records = self.odrp.parse_file(fixture_path)
+        self.assertEqual(len(records), 10)
+
+
+    def testErrorOnMissingColumns(self):
+        fixture_path = Tests.Fixtures.Path() + '/LEHD/ten_records-twelve_columns.csv'
+        self.assertRaises(RecordError, self.odrp.parse_file, fixture_path)
+
+
+    def testErrorOnMissingHeader(self):
+        fixture_path = Tests.Fixtures.Path() + '/LEHD/ten_records-no_header.csv'
+        self.assertRaises(RecordError, self.odrp.parse_file, fixture_path)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(OriginDestinationRecordParserTest))
+    return suite
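+
+
+# Convenience runner so this test module can be executed directly;
+# it mirrors the TextTestRunner call used by the run_tests script.
+if __name__ == '__main__':
+    unittest.TextTestRunner(verbosity=2).run(suite())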