From: Michael Orlitzky
Date: Mon, 16 Nov 2009 01:26:15 +0000 (-0500)
Subject: Added an LEHD module, with classes to parse Origin-Destination matrix files.
X-Git-Url: https://gitweb.michael.orlitzky.com/?a=commitdiff_plain;h=fbf6257eecbc601a4f5748583d0188adc1e5ffec;p=dead%2Fcensus-tools.git

Added an LEHD module, with classes to parse Origin-Destination matrix files.

Added tests and fixtures for the LEHD module. Updated the run_tests
script to include the new LEHD tests.
---

diff --git a/bin/run_tests b/bin/run_tests
index f6b9528..69b9a88 100755
--- a/bin/run_tests
+++ b/bin/run_tests
@@ -9,6 +9,7 @@
 from Tests.Unit import CensusTest
 from Tests.Unit import FileUtilsTest
 from Tests.Unit import GeometryTest
 from Tests.Unit import KMLTest
+from Tests.Unit import LEHDTest
 from Tests.Unit import SummaryFile1Test
 from Tests.Unit import StringUtilsTest
@@ -19,4 +20,5 @@
 suite.addTest(GeometryTest.suite())
 suite.addTest(KMLTest.suite())
+suite.addTest(LEHDTest.suite())
 suite.addTest(SummaryFile1Test.suite())
 suite.addTest(StringUtilsTest.suite())
diff --git a/src/LEHD.py b/src/LEHD.py
new file mode 100644
index 0000000..b671504
--- /dev/null
+++ b/src/LEHD.py
@@ -0,0 +1,92 @@
+"""
+Classes for parsing LEHD (Longitudinal Employer-Household Dynamics)
+data. In particular, we are currently parsing the Origin-Destination
+data provided by OnTheMap:
+
+  http://www.vrdc.cornell.edu/onthemap/doc/index.html
+
+"""
+
+from Errors import RecordError
+
+
+class OriginDestinationRecord(object):
+    """
+    Represents one record in an Origin-Destination matrix file. The
+    fields contained within one of these files are recorded in the
+    first row of the appropriate CSV files. At this time,
+    documentation does not seem to be available for the newer version
+    3.x fields.
+    """
+    # Every data row must contain at least this many comma-separated
+    # fields, one per column named in the header row.
+    NUM_FIELDS = 13
+
+
+
+class OriginDestinationRecordParser(object):
+
+    # This is the header for an Origin-Destination matrix file. If
+    # this isn't the first line of our file, then something is wrong.
+    HEADER = 'w_geocode,h_geocode,total,age1,age2,age3,earn1,earn2,earn3,ind1,ind2,ind3,createdate'
+
+    def parse_file(self, path):
+        """
+        Assuming that path refers to an Origin-Destination file, parse
+        each record contained therein. These files should simply be
+        CSV with no text qualifiers, and can therefore be parsed based
+        on comma placement alone.
+
+        Raises a RecordError if the first row of the file is not the
+        expected HEADER.
+        """
+
+        # Our return value, a list of records.
+        records = []
+
+        # The context manager guarantees that the file is closed even
+        # if a RecordError aborts parsing partway through.
+        with open(path, 'r') as f:
+            first_line = f.readline().strip()
+            if not (first_line == self.HEADER):
+                raise RecordError('According to the header (first row), this is not an Origin-Destination matrix file. The first line of an Origin-Destination matrix file should be,\n %s \nBut, the first line of your file is,\n %s\n' % (self.HEADER, first_line))
+
+            # We have already read the header line, so this process should
+            # start on the first non-header line.
+            for line in f:
+                record = self.parse_line(line)
+                records.append(record)
+
+        return records
+
+
+    def parse_line(self, line):
+        """
+        Parse one line of an Origin-Destination matrix file into an
+        OriginDestinationRecord object.
+
+        Raises a RecordError if the line contains fewer than
+        OriginDestinationRecord.NUM_FIELDS fields.
+        """
+        # Strip the line terminator before splitting; otherwise it
+        # would remain attached to the final (createdate) field.
+        fields = line.strip().split(',')
+
+        if (len(fields) < OriginDestinationRecord.NUM_FIELDS):
+            raise RecordError("The line,\n %s \n does not contain enough fields. The minimum number of fields required in an Origin-Destination matrix file is %d. This line contains %d fields." % (line, OriginDestinationRecord.NUM_FIELDS, len(fields)))
+
+        od = OriginDestinationRecord()
+        od.w_geocode = fields[0]
+        od.h_geocode = fields[1]
+        od.total = fields[2]
+        od.age1 = fields[3]
+        od.age2 = fields[4]
+        od.age3 = fields[5]
+        od.earn1 = fields[6]
+        od.earn2 = fields[7]
+        od.earn3 = fields[8]
+        od.ind1 = fields[9]
+        od.ind2 = fields[10]
+        od.ind3 = fields[11]
+        od.createdate = fields[12]
+
+        return od
diff --git a/src/Tests/Fixtures/LEHD/ten_records-no_header.csv b/src/Tests/Fixtures/LEHD/ten_records-no_header.csv
new file mode 100644
index 0000000..f1e09cf
--- /dev/null
+++ b/src/Tests/Fixtures/LEHD/ten_records-no_header.csv
@@ -0,0 +1,10 @@
+240010001001011,240010001001033,1,1,0,0,0,1,0,0,1,0,20090211
+240010001001011,240010001002059,1,0,1,0,0,1,0,0,1,0,20090211
+240010001001011,240010001002070,1,1,0,0,1,0,0,0,1,0,20090211
+240010001001011,240010001002155,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240010004001008,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240010004002022,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240037401042006,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240317017032003,1,1,0,0,1,0,0,0,1,0,20090211
+240010001001011,240338005114011,1,0,0,1,1,0,0,0,1,0,20090211
+240010001001011,240430107002085,1,1,0,0,1,0,0,0,1,0,20090211
diff --git a/src/Tests/Fixtures/LEHD/ten_records-twelve_columns.csv b/src/Tests/Fixtures/LEHD/ten_records-twelve_columns.csv
new file mode 100644
index 0000000..e844028
--- /dev/null
+++ b/src/Tests/Fixtures/LEHD/ten_records-twelve_columns.csv
@@ -0,0 +1,11 @@
+w_geocode,h_geocode,total,age1,age2,age3,earn1,earn2,earn3,ind1,ind2,ind3
+240010001001011,240010001001033,1,1,0,0,0,1,0,0,1,0
+240010001001011,240010001002059,1,0,1,0,0,1,0,0,1,0
+240010001001011,240010001002070,1,1,0,0,1,0,0,0,1,0
+240010001001011,240010001002155,1,0,1,0,1,0,0,0,1,0
+240010001001011,240010004001008,1,0,1,0,1,0,0,0,1,0
+240010001001011,240010004002022,1,0,1,0,1,0,0,0,1,0
+240010001001011,240037401042006,1,0,1,0,1,0,0,0,1,0
+240010001001011,240317017032003,1,1,0,0,1,0,0,0,1,0
+240010001001011,240338005114011,1,0,0,1,1,0,0,0,1,0
+240010001001011,240430107002085,1,1,0,0,1,0,0,0,1,0
diff --git a/src/Tests/Fixtures/LEHD/ten_records.csv b/src/Tests/Fixtures/LEHD/ten_records.csv
new file mode 100644
index 0000000..93409e8
--- /dev/null
+++ b/src/Tests/Fixtures/LEHD/ten_records.csv
@@ -0,0 +1,11 @@
+w_geocode,h_geocode,total,age1,age2,age3,earn1,earn2,earn3,ind1,ind2,ind3,createdate
+240010001001011,240010001001033,1,1,0,0,0,1,0,0,1,0,20090211
+240010001001011,240010001002059,1,0,1,0,0,1,0,0,1,0,20090211
+240010001001011,240010001002070,1,1,0,0,1,0,0,0,1,0,20090211
+240010001001011,240010001002155,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240010004001008,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240010004002022,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240037401042006,1,0,1,0,1,0,0,0,1,0,20090211
+240010001001011,240317017032003,1,1,0,0,1,0,0,0,1,0,20090211
+240010001001011,240338005114011,1,0,0,1,1,0,0,0,1,0,20090211
+240010001001011,240430107002085,1,1,0,0,1,0,0,0,1,0,20090211
diff --git a/src/Tests/Unit/LEHDTest.py b/src/Tests/Unit/LEHDTest.py
new file mode 100644
index 0000000..ca3013e
--- /dev/null
+++ b/src/Tests/Unit/LEHDTest.py
@@ -0,0 +1,37 @@
+import unittest
+
+import Tests.Fixtures
+from Errors import RecordError
+import LEHD
+
+
+class OriginDestinationRecordParserTest(unittest.TestCase):
+
+    def setUp(self):
+        # A fresh parser instance is created before each test.
+        self.odrp = LEHD.OriginDestinationRecordParser()
+
+
+    def testAllOfSubsetParsed(self):
+        # Every record in the well-formed fixture should be parsed.
+        fixture_path = Tests.Fixtures.Path() + '/LEHD/ten_records.csv'
+        records = self.odrp.parse_file(fixture_path)
+        self.assertEqual(len(records), 10)
+
+
+    def testErrorOnMissingColumns(self):
+        # This fixture is missing the createdate column entirely.
+        fixture_path = Tests.Fixtures.Path() + '/LEHD/ten_records-twelve_columns.csv'
+        self.assertRaises(RecordError, self.odrp.parse_file, fixture_path)
+
+
+    def testErrorOnMissingHeader(self):
+        # With no header row, the first data row is read as a (bad) header.
+        fixture_path = Tests.Fixtures.Path() + '/LEHD/ten_records-no_header.csv'
+        self.assertRaises(RecordError, self.odrp.parse_file, fixture_path)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(OriginDestinationRecordParserTest))
+    return suite