"""
Classes for parsing LEHD (Longitudinal Employer-Household Dynamics)
data. In particular, we are currently parsing the Origin-Destination
data provided by OnTheMap:

http://www.vrdc.cornell.edu/onthemap/doc/index.html
"""

import os

from Errors import RecordError


class OriginDestinationRecord(object):
    """
    Represents one record in an Origin-Destination matrix file.

    The fields contained within one of these files are recorded in the
    first row of the appropriate CSV files. At this time, documentation
    does not seem to be available for the newer version 3.x fields.

    Instances are populated by OriginDestinationRecordParser.parse_line,
    which sets one attribute per column (w_geocode, h_geocode, total,
    age1..age3, earn1..earn3, ind1..ind3, createdate). Each attribute
    holds the unconverted string taken from the file.
    """

    # Minimum number of comma-separated fields a data line must contain.
    NUM_FIELDS = 13


class OriginDestinationRecordParser(object):
    """
    Parses Origin-Destination matrix files into OriginDestinationRecord
    objects.
    """

    # This is the header for an Origin-Destination matrix file. If
    # this isn't the first line of our file, then something is wrong.
    HEADER = 'w_geocode,h_geocode,total,age1,age2,age3,earn1,earn2,earn3,ind1,ind2,ind3,createdate'

    def parse_file(self, path):
        """
        Assuming that path refers to an Origin-Destination file, parse
        each record contained therein. These files should simply be
        CSV with no text qualifiers, and can therefore be parsed based
        on comma placement alone.

        Returns a list of OriginDestinationRecord objects, one per line
        following the header.

        Raises RecordError if the first line is not HEADER, or if any
        subsequent line has too few fields.
        """
        # The with-block guarantees the file is closed even when a
        # RecordError is raised part-way through parsing.
        with open(path, 'r') as f:
            first_line = f.readline().strip()
            if first_line != self.HEADER:
                raise RecordError(
                    'According to the header (first row), this is not an '
                    'Origin-Destination matrix file. The first line of an '
                    'Origin-Destination matrix file should be,\n %s \n'
                    'But, the first line of your file is,\n %s\n'
                    % (self.HEADER, first_line))

            # We have already read the header line, so this process
            # starts on the first non-header line.
            return [self.parse_line(line) for line in f]

    def parse_line(self, line):
        """
        Parse one line of an Origin-Destination matrix file into an
        OriginDestinationRecord object.

        Any trailing line terminator is removed before splitting, so it
        does not end up inside the last field. Fields beyond the first
        NUM_FIELDS are ignored.

        Raises RecordError if the line has fewer than NUM_FIELDS fields.
        """
        # Lines read from a file keep their terminator; drop it so that
        # createdate is not stored with a trailing newline.
        line = line.rstrip('\r\n')
        fields = line.split(',')

        if len(fields) < OriginDestinationRecord.NUM_FIELDS:
            raise RecordError(
                "The line,\n %s \n does not contain enough fields. "
                "The minimum number of fields required in an "
                "Origin-Destination matrix file is %d. This line "
                "contains %d fields."
                % (line, OriginDestinationRecord.NUM_FIELDS, len(fields)))

        od = OriginDestinationRecord()
        od.w_geocode = fields[0]
        od.h_geocode = fields[1]
        od.total = fields[2]
        od.age1 = fields[3]
        od.age2 = fields[4]
        od.age3 = fields[5]
        od.earn1 = fields[6]
        od.earn2 = fields[7]
        od.earn3 = fields[8]
        od.ind1 = fields[9]
        od.ind2 = fields[10]
        od.ind3 = fields[11]
        od.createdate = fields[12]

        return od