]> gitweb.michael.orlitzky.com - dead/census-tools.git/blob - src/LEHD.py
Added an LEHD module, with classes to parse Origin-Destination matrix files.
[dead/census-tools.git] / src / LEHD.py
1 """
2 Classes for parsing LEHD (Longitudinal Employer-Household Dynamics)
3 data. In particular, we are currently parsing the Origin-Destination
4 data provided by OnTheMap:
5
6 http://www.vrdc.cornell.edu/onthemap/doc/index.html
7
8 """
9
10 import os
11
12 from Errors import RecordError
13
14
class OriginDestinationRecord(object):
    """
    Represents one record in an Origin-Destination matrix file. The
    fields contained within one of these files are recorded in the
    first row of the appropriate CSV files. At this time,
    documentation does not seem to be available for the newer version
    3.x fields.

    Every name in FIELD_NAMES is available as an attribute of the
    same name on each instance. A freshly constructed record has all
    of them set to None, so reading a field that has not been
    populated yet does not raise AttributeError.
    """

    # The column names of an Origin-Destination matrix file, in the
    # order in which they appear in the file's header row.
    FIELD_NAMES = (
        'w_geocode',
        'h_geocode',
        'total',
        'age1',
        'age2',
        'age3',
        'earn1',
        'earn2',
        'earn3',
        'ind1',
        'ind2',
        'ind3',
        'createdate',
    )

    # Derived from FIELD_NAMES so that the two cannot drift apart.
    NUM_FIELDS = len(FIELD_NAMES)

    def __init__(self):
        """
        Create an empty record with every field set to None.
        """
        for name in self.FIELD_NAMES:
            setattr(self, name, None)

    def __repr__(self):
        """
        Return a debugging representation listing every field and
        its current value.
        """
        pairs = ', '.join('%s=%r' % (name, getattr(self, name, None))
                          for name in self.FIELD_NAMES)
        return '%s(%s)' % (type(self).__name__, pairs)
24
25
26
class OriginDestinationRecordParser(object):
    """
    Parses Origin-Destination matrix files (plain CSV, no text
    qualifiers) into OriginDestinationRecord objects.
    """

    # This is the header for an Origin-Destination matrix file. If
    # this isn't the first line of our file, then something is wrong.
    HEADER = 'w_geocode,h_geocode,total,age1,age2,age3,earn1,earn2,earn3,ind1,ind2,ind3,createdate'

    def parse_file(self, path):
        """
        Assuming that path refers to an Origin-Destination file, parse
        each record contained therein. These files should simply be
        CSV with no text qualifiers, and can therefore be parsed based
        on comma placement alone.

        Returns a list of OriginDestinationRecord objects, one for
        each non-blank line following the header. Lines consisting
        only of whitespace (for example, a trailing empty line at the
        end of the file) are skipped.

        Raises RecordError if the first line is not HEADER, or if any
        data line contains too few fields.
        """

        # Our return value, a list of records.
        records = []

        # The 'with' block guarantees that the file is closed even
        # when the header check or parse_line() raises.
        with open(path, 'r') as f:
            first_line = f.readline().strip()
            if first_line != self.HEADER:
                raise RecordError(
                    'According to the header (first row), this is not an '
                    'Origin-Destination matrix file. The first line of an '
                    'Origin-Destination matrix file should be,\n %s \n'
                    'But, the first line of your file is,\n %s\n'
                    % (self.HEADER, first_line))

            # We have already read the header line, so this process
            # should start on the first non-header line.
            for line in f:
                if not line.strip():
                    # A blank line carries no record.
                    continue
                records.append(self.parse_line(line))

        return records

    def parse_line(self, line):
        """
        Parse one line of an Origin-Destination matrix file into an
        OriginDestinationRecord object.

        Any trailing line terminator is removed before the line is
        split on commas, so the last field does not carry a newline.
        Every field is stored as a string, exactly as it appears in
        the file. Fields beyond the thirteenth are ignored.

        Raises RecordError if the line contains fewer than
        OriginDestinationRecord.NUM_FIELDS fields.
        """
        # Lines handed to us by file iteration still end in "\n" (or
        # "\r\n"); without this, createdate would include it.
        line = line.rstrip('\r\n')
        fields = line.split(',')

        if len(fields) < OriginDestinationRecord.NUM_FIELDS:
            raise RecordError(
                "The line,\n %s \n does not contain enough fields. The "
                "minimum number of fields required in an Origin-Destination "
                "matrix file is %d. This line contains %d fields."
                % (line, OriginDestinationRecord.NUM_FIELDS, len(fields)))

        od = OriginDestinationRecord()
        od.w_geocode = fields[0]
        od.h_geocode = fields[1]
        od.total = fields[2]
        od.age1 = fields[3]
        od.age2 = fields[4]
        od.age3 = fields[5]
        od.earn1 = fields[6]
        od.earn2 = fields[7]
        od.earn3 = fields[8]
        od.ind1 = fields[9]
        od.ind2 = fields[10]
        od.ind3 = fields[11]
        od.createdate = fields[12]

        return od
86 return od