Skip to content

Instantly share code, notes, and snippets.

@bycoffe
Created September 2, 2010 17:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bycoffe/562606 to your computer and use it in GitHub Desktop.
Save bycoffe/562606 to your computer and use it in GitHub Desktop.
class FixedWidthParser(object):
    """A parser for fixed-width data files.

    Construct it with a list of field names and column ranges, then pass
    a file-like object to parse() to get back a dictionary for each row.

    Useful for converting a fixed-width file to a CSV.
    See tests.py for a usage example.
    """

    def __init__(self, fields):
        """
        fields: a list of tuples in the form (fieldname, (startchar, endchar)).
            startchar and endchar are 1-based and inclusive, so (1, 9)
            selects the first nine characters of each line.
        """
        self.fields = fields

    def parse(self, fh):
        """
        fh: a file-like object (any iterable of lines works)

        Yields one dict per line, mapping each fieldname to the slice of
        the line covered by that field. Padding inside a field is kept
        as-is; only the line terminator is removed. A field that extends
        past the end of a line comes back truncated (possibly empty).
        """
        for line in fh:
            # Remove the line terminator before slicing; otherwise a field
            # that reaches the end of the line would carry the "\n" (or
            # "\r\n") in its value. Pick the strip characters to match the
            # line type so both text and bytes input keep working.
            terminator = b'\r\n' if isinstance(line, bytes) else '\r\n'
            line = line.rstrip(terminator)
            data = {}
            for fieldname, (start, end) in self.fields:
                # Convert the 1-based inclusive range to a Python slice.
                data[fieldname] = line[start - 1:end]
            yield data
import unittest
from fixedwidthparser import FixedWidthParser
class TestFixedWidthParser(unittest.TestCase):
    """Exercises FixedWidthParser against a small fixed-width sample."""

    def test_parser(self):
        """Parse four sample records and check the date and city fields."""
        # cStringIO and StringIO exist only on Python 2; fall back to
        # io.StringIO so the test also imports cleanly on Python 3.
        try:
            from cStringIO import StringIO
        except ImportError:
            try:
                from StringIO import StringIO
            except ImportError:
                from io import StringIO
        fields = [('filer_id', (1, 9)),
                  ('amendment', (10, 10)),
                  ('report_type', (11, 13)),
                  ('primary_general', (14, 14)),
                  ('microfilm', (15, 25)),
                  ('transaction_type', (26, 28)),
                  ('contributor_name', (29, 62)),
                  ('city', (63, 80)),
                  ('state', (81, 82)),
                  ('zipcode', (83, 87)),
                  ('occupation', (88, 122)),
                  ('month', (123, 124)),
                  ('day', (125, 126)),
                  ('century', (127, 128)),
                  ('year', (129, 130)),
                  ('amount', (131, 137)),
                  ('other_id', (138, 146)),
                  ('fec_record', (147, 153)), ]
        # One tuple of values per record, in the same order as `fields`.
        # NOTE(review): the occupation column is assumed to be blank in
        # every sample record -- confirm against the original data file.
        records = [
            ('C00000042', 'N', 'YE', 'P', '28930176254', '24K',
             'Congress for Judy Biggert', 'Clarendon Hills', 'IL', '60514',
             '', '10', '22', '20', '07', '0002000', 'C00330241', '1367020'),
            ('C00000042', 'N', 'YE', 'P', '28930176254', '24K',
             'Friends of John Boehner', 'Hamilton', 'OH', '45011',
             '', '10', '22', '20', '07', '0005000', 'C00237198', '1367021'),
            ('C00000042', 'N', 'YE', 'P', '28930176254', '24K',
             'Capito for Congress', 'Charleston', 'WV', '25314',
             '', '10', '22', '20', '07', '0001000', 'C00347849', '1367022'),
            ('C00000042', 'N', 'YE', 'P', '28930176255', '24K',
             'Congressman for John Carter', 'Round Rock', 'TX', '78664',
             '', '10', '22', '20', '07', '0001000', 'C00371203', '1367023'),
        ]
        # Pad every value to its declared column width so each line really
        # is fixed-width; a hand-typed literal loses its alignment as soon
        # as runs of spaces are collapsed.
        data = '\n'.join(
            ''.join(value.ljust(end - start + 1)
                    for value, (_, (start, end)) in zip(record, fields))
            for record in records)
        expected_cities = ['Clarendon Hills', 'Hamilton', 'Charleston',
                           'Round Rock']
        fh = StringIO(data)
        parser = FixedWidthParser(fields)
        rows = list(parser.parse(fh))
        # Guard against a vacuous pass when the parser yields nothing.
        self.assertEqual(len(rows), len(expected_cities))
        for row, city in zip(rows, expected_cities):
            self.assertEqual(row['century'], '20')
            self.assertEqual(row['year'], '07')
            self.assertEqual(row['city'].strip(), city)
# Run the test suite when this file is executed directly as a script.
if "__main__" == __name__:
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment