Added an LEHD module, with classes to parse Origin-Destination matrix files.

[dead/census-tools.git] / src / LEHD.py
diff --git a/src/LEHD.py b/src/LEHD.py

new file mode 100644 (file)

index 0000000..b671504
--- /dev/null
+++ b/src/LEHD.py
@@ -0,0 +1,86 @@
+"""
+Classes for parsing LEHD (Longitudinal Employer-Household Dynamics)
+data. In particular, we are currently parsing the Origin-Destination
+data provided by OnTheMap:
+
+  http://www.vrdc.cornell.edu/onthemap/doc/index.html
+
+"""
+
+import os
+
+from Errors import RecordError
+
+
+class OriginDestinationRecord(object):    
+    """
+    Represents one record in an Origin-Destination matrix file. The
+    fields contained within one of these files are named in the first
+    row of the appropriate CSV file. At this time, documentation does
+    not seem to be available for the newer version 3.x fields.
+    """
+    # Minimum number of comma-separated fields a record must contain.
+    NUM_FIELDS = 13
+
+
+
+class OriginDestinationRecordParser(object):
+    """
+    Parses Origin-Destination matrix files (comma-separated text with
+    a fixed header row) into OriginDestinationRecord objects.
+    """
+
+    # This is the header for an Origin-Destination matrix file. If
+    # this isn't the first line of our file, then something is wrong.
+    HEADER = 'w_geocode,h_geocode,total,age1,age2,age3,earn1,earn2,earn3,ind1,ind2,ind3,createdate'
+
+    def parse_file(self, path):
+        """
+        Assuming that path refers to an Origin-Destination file, parse
+        each record contained therein and return them as a list, in
+        file order. These files should simply be CSV with no text
+        qualifiers, and can therefore be parsed based on comma
+        placement alone. Blank lines are skipped.
+
+        Raises RecordError if the first line is not HEADER, or if a
+        record contains too few fields.
+        """
+        records = []
+
+        f = open(path, 'r')
+        try:
+            first_line = f.readline().strip()
+            if first_line != self.HEADER:
+                raise RecordError('According to the header (first row), this is not an Origin-Destination matrix file. The first line of an Origin-Destination matrix file should be,\n %s \nBut, the first line of your file is,\n %s\n' % (self.HEADER, first_line))
+
+            # The header line is already consumed; skip blank lines (e.g.
+            # a trailing empty line at end of file) rather than fail.
+            for line in f:
+                if line.strip():
+                    records.append(self.parse_line(line))
+        finally:
+            # Close the file even if a RecordError was raised above.
+            f.close()
+
+        return records
+
+    def parse_line(self, line):
+        """
+        Parse one line of an Origin-Destination matrix file into an
+        OriginDestinationRecord object. Every field is stored as a
+        string, under the column name given in HEADER.
+
+        Raises RecordError if the line contains fewer than
+        OriginDestinationRecord.NUM_FIELDS fields.
+        """
+        # Drop the line terminator so it can't end up in createdate.
+        line = line.rstrip('\r\n')
+        fields = line.split(',')
+
+        if len(fields) < OriginDestinationRecord.NUM_FIELDS:
+            raise RecordError("The line,\n %s \n does not contain enough fields. The minimum number of fields required in an Origin-Destination matrix file is %d. This line contains %d fields." % (line, OriginDestinationRecord.NUM_FIELDS, len(fields)))
+
+        od = OriginDestinationRecord()
+        for name, value in zip(self.HEADER.split(','), fields):
+            setattr(od, name, value)
+        return od