From fbf6257eecbc601a4f5748583d0188adc1e5ffec Mon Sep 17 00:00:00 2001
From: Michael Orlitzky
Date: Sun, 15 Nov 2009 20:26:15 -0500
Subject: [PATCH] Added an LEHD module, with classes to parse
 Origin-Destination matrix files.

Added tests and fixtures for the LEHD module.
Updated the run_tests script to include the new LEHD tests.
---
 bin/run_tests                                |  2 +
 src/LEHD.py                                  | 86 ++++++++++++++++++++
 .../Fixtures/LEHD/ten_records-no_header.csv  | 10 +++
 .../LEHD/ten_records-twelve_columns.csv      | 11 +++
 src/Tests/Fixtures/LEHD/ten_records.csv      | 11 +++
 src/Tests/Unit/LEHDTest.py                   | 32 +++++++
 6 files changed, 152 insertions(+)
 create mode 100644 src/LEHD.py
 create mode 100644 src/Tests/Fixtures/LEHD/ten_records-no_header.csv
 create mode 100644 src/Tests/Fixtures/LEHD/ten_records-twelve_columns.csv
 create mode 100644 src/Tests/Fixtures/LEHD/ten_records.csv
 create mode 100644 src/Tests/Unit/LEHDTest.py

diff --git a/bin/run_tests b/bin/run_tests
index f6b9528..69b9a88 100755
--- a/bin/run_tests
+++ b/bin/run_tests
@@ -9,6 +9,7 @@
 from Tests.Unit import CensusTest
 from Tests.Unit import FileUtilsTest
 from Tests.Unit import GeometryTest
 from Tests.Unit import KMLTest
+from Tests.Unit import LEHDTest
 from Tests.Unit import SummaryFile1Test
 from Tests.Unit import StringUtilsTest
@@ -19,4 +20,5 @@ suite.addTest(GeometryTest.suite())
 suite.addTest(KMLTest.suite())
 suite.addTest(SummaryFile1Test.suite())
 suite.addTest(StringUtilsTest.suite())
+suite.addTest(LEHDTest.suite())
 unittest.TextTestRunner(verbosity=2).run(suite)
diff --git a/src/LEHD.py b/src/LEHD.py
new file mode 100644
index 0000000..b671504
--- /dev/null
+++ b/src/LEHD.py
@@ -0,0 +1,86 @@
+"""
+Classes for parsing LEHD (Longitudinal Employer-Household Dynamics)
+data. In particular, we are currently parsing the Origin-Destination
+data provided by OnTheMap:
+
+  http://www.vrdc.cornell.edu/onthemap/doc/index.html
+
+"""
+
+import os
+
+from Errors import RecordError
+
+
+class OriginDestinationRecord(object):
+    """
+    Represents one record in an Origin-Destination matrix file. The
+    fields contained within one of these files are recorded in the
+    first row of the appropriate CSV files. At this time,
+    documentation does not seem to be available for the newer version
+    3.x fields.
+    """
+    NUM_FIELDS = 13
+
+
+
+class OriginDestinationRecordParser(object):
+
+    # This is the header for an Origin-Destination matrix file. If
+    # this isn't the first line of our file, then something is wrong.
+    HEADER = 'w_geocode,h_geocode,total,age1,age2,age3,earn1,earn2,earn3,ind1,ind2,ind3,createdate'
+
+    def parse_file(self, path):
+        """
+        Assuming that path refers to an Origin-Destination file, parse
+        each record contained therein. These files should simply be
+        CSV with no text qualifiers, and can therefore be parsed based
+        on comma placement alone.
+        """
+
+        # Our return value, a list of records.
+        records = []
+
+        f = open(path, 'r')
+
+        first_line = f.readline().strip()
+        if not (first_line == self.HEADER):
+            raise RecordError('According to the header (first row), this is not an Origin-Destination matrix file. The first line of an Origin-Destination matrix file should be:\n  %s\nbut the first line of your file is:\n  %s\n' % (self.HEADER, first_line))
+
+        # We have already read the header line, so this process should
+        # start on the first non-header line.
+        for line in f:
+            record = self.parse_line(line)
+            records.append(record)
+
+        f.close()
+
+        return records
+
+
+    def parse_line(self, line):
+        """
+        Parse one line of an Origin-Destination matrix file into an
+        OriginDestinationRecord object.
+        """
+        fields = line.strip().split(',')
+
+        if (len(fields) < OriginDestinationRecord.NUM_FIELDS):
+            raise RecordError("The line,\n  %s\ndoes not contain enough fields. The minimum number of fields required in an Origin-Destination matrix file is %d. This line contains %d fields." % (line, OriginDestinationRecord.NUM_FIELDS, len(fields)))
+
+        od = OriginDestinationRecord()
+        od.w_geocode = fields[0]
+        od.h_geocode = fields[1]
+        od.total = fields[2]
+        od.age1 = fields[3]
+        od.age2 = fields[4]
+        od.age3 = fields[5]
+        od.earn1 = fields[6]
+        od.earn2 = fields[7]
+        od.earn3 = fields[8]
+        od.ind1 = fields[9]
+        od.ind2 = fields[10]
+        od.ind3 = fields[11]
+        od.createdate = fields[12]
+
+        return od
diff --git a/src/Tests/Fixtures/LEHD/ten_records-no_header.csv b/src/Tests/Fixtures/LEHD/ten_records-no_header.csv
new file mode 100644
index 0000000..f1e09cf
--- /dev/null
+++ b/src/Tests/Fixtures/LEHD/ten_records-no_header.csv
@@ -0,0 +1,10 @@
+240010001001011,240010001001033,1,1,0,0,0,1,0,0,1,0,20090211
+240010001001011,240010001002059,1,0,1,0,0,1,0,0,1,0,20090211
+240010001001011,240010001002070,1,1,0,0,1,0,0,0,1,0,20090211
+240010001001011,240010001002155,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240010004001008,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240010004002022,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240037401042006,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240317017032003,1,1,0,0,1,0,0,0,1,0,20090211
+240010001001011,240338005114011,1,0,0,1,1,0,0,0,1,0,20090211
+240010001001011,240430107002085,1,1,0,0,1,0,0,0,1,0,20090211
diff --git a/src/Tests/Fixtures/LEHD/ten_records-twelve_columns.csv b/src/Tests/Fixtures/LEHD/ten_records-twelve_columns.csv
new file mode 100644
index 0000000..e844028
--- /dev/null
+++ b/src/Tests/Fixtures/LEHD/ten_records-twelve_columns.csv
@@ -0,0 +1,11 @@
+w_geocode,h_geocode,total,age1,age2,age3,earn1,earn2,earn3,ind1,ind2,ind3
+240010001001011,240010001001033,1,1,0,0,0,1,0,0,1,0
+240010001001011,240010001002059,1,0,1,0,0,1,0,0,1,0
+240010001001011,240010001002070,1,1,0,0,1,0,0,0,1,0
+240010001001011,240010001002155,1,0,1,0,1,0,0,0,1,0
+240010001001011,240010004001008,1,0,1,0,1,0,0,0,1,0
+240010001001011,240010004002022,1,0,1,0,1,0,0,0,1,0
+240010001001011,240037401042006,1,0,1,0,1,0,0,0,1,0
+240010001001011,240317017032003,1,1,0,0,1,0,0,0,1,0
+240010001001011,240338005114011,1,0,0,1,1,0,0,0,1,0
+240010001001011,240430107002085,1,1,0,0,1,0,0,0,1,0
diff --git a/src/Tests/Fixtures/LEHD/ten_records.csv b/src/Tests/Fixtures/LEHD/ten_records.csv
new file mode 100644
index 0000000..93409e8
--- /dev/null
+++ b/src/Tests/Fixtures/LEHD/ten_records.csv
@@ -0,0 +1,11 @@
+w_geocode,h_geocode,total,age1,age2,age3,earn1,earn2,earn3,ind1,ind2,ind3,createdate
+240010001001011,240010001001033,1,1,0,0,0,1,0,0,1,0,20090211
+240010001001011,240010001002059,1,0,1,0,0,1,0,0,1,0,20090211
+240010001001011,240010001002070,1,1,0,0,1,0,0,0,1,0,20090211
+240010001001011,240010001002155,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240010004001008,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240010004002022,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240037401042006,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240317017032003,1,1,0,0,1,0,0,0,1,0,20090211
+240010001001011,240338005114011,1,0,0,1,1,0,0,0,1,0,20090211
+240010001001011,240430107002085,1,1,0,0,1,0,0,0,1,0,20090211
diff --git a/src/Tests/Unit/LEHDTest.py b/src/Tests/Unit/LEHDTest.py
new file mode 100644
index 0000000..ca3013e
--- /dev/null
+++ b/src/Tests/Unit/LEHDTest.py
@@ -0,0 +1,32 @@
+import unittest
+
+import Tests.Fixtures
+from Errors import RecordError
+import LEHD
+
+
+class OriginDestinationRecordParserTest(unittest.TestCase):
+
+    def setUp(self):
+        self.odrp = LEHD.OriginDestinationRecordParser()
+
+
+    def testAllOfSubsetParsed(self):
+        fixture_path = Tests.Fixtures.Path() + '/LEHD/ten_records.csv'
+        records = self.odrp.parse_file(fixture_path)
+        self.assertEqual(len(records), 10)
+
+
+    def testErrorOnMissingColumns(self):
+        fixture_path = Tests.Fixtures.Path() + '/LEHD/ten_records-twelve_columns.csv'
+        self.assertRaises(RecordError, self.odrp.parse_file, fixture_path)
+
+
+    def testErrorOnMissingHeader(self):
+        fixture_path = Tests.Fixtures.Path() + '/LEHD/ten_records-no_header.csv'
+        self.assertRaises(RecordError, self.odrp.parse_file, fixture_path)
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(OriginDestinationRecordParserTest))
+    return suite
-- 
2.43.2
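
For anyone trying out the new module, here is a minimal usage sketch; it is
not part of the patch. It assumes Python 2 (which the rest of the project
appears to use) and a working directory of src/, so that LEHD.py and
Errors.py are importable and the ten_records.csv fixture added above is
reachable at the relative path below. The class, method, and attribute
names come straight from the patched LEHD.py; everything else (the parser
variable, the print format) is illustrative only.

    # Usage sketch only -- not part of the patch. Assumes Python 2 and
    # that it is run from the src/ directory.
    import LEHD
    from Errors import RecordError

    parser = LEHD.OriginDestinationRecordParser()

    try:
        # Parse the ten-record fixture added by this patch.
        records = parser.parse_file('Tests/Fixtures/LEHD/ten_records.csv')
    except RecordError, e:
        # Raised when the header or a record does not match the expected
        # Origin-Destination matrix format.
        print e
    else:
        # Each record exposes the thirteen header fields as attributes,
        # e.g. the home and work geocodes and the total job count.
        for od in records:
            print '%s -> %s: %s' % (od.h_geocode, od.w_geocode, od.total)

The new unit tests themselves run through the existing bin/run_tests
script, which this patch updates to include LEHDTest.suite().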