Last active
December 11, 2015 19:38
-
-
Save nickbarnwell/4649575 to your computer and use it in GitHub Desktop.
Python script for parsing the MSFT S13 Intern Start Date Document
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import string | |
import sys | |
import re | |
# A date header line: a run of letters, one space, then digits ending the
# line (for example "June 3").  Group 1 is the word, group 2 the number.
RE_DATE = re.compile(r"([a-zA-Z]+) (\d+)$")
# An intern line: "<name> (<details>)".  Group 1 is the text before the
# parentheses, group 2 the text inside them.
RE_INTERN = re.compile(r"(.+) \((.+)\)")
def parse_intern(line):
    """Turn one intern line into a record dict.

    Args:
        line: A line that matches ``RE_INTERN``, i.e. ``"<name> (<details>)"``.

    Returns:
        A dict holding ``'name'`` plus every key returned by ``parse_data``
        for the text inside the parentheses.

    Raises:
        AttributeError: If ``line`` does not match ``RE_INTERN`` (the match
            object is ``None``).
    """
    matched = RE_INTERN.match(line)
    who = matched.group(1)
    details = matched.group(2)
    record = {'name': who}
    # dict.update mutates in place and returns None, so it cannot be chained
    # into the return statement.
    record.update(parse_data(details))
    return record
def parse_data(data):
    """Parse the comma-separated details found inside an intern's parentheses.

    Args:
        data: Text with at least three comma-separated fields.

    Returns:
        A dict with ``'position'`` and ``'team'`` (the first two fields,
        whitespace-stripped) and ``'num'`` (the first character of the
        stripped third field converted to ``int``).

    Raises:
        IndexError: If there are fewer than three fields or the third field
            is empty after stripping.
        ValueError: If the third field does not start with a digit.
    """
    # A list comprehension rather than map(string.strip, ...): string.strip
    # only exists on Python 2, and on Python 3 map() returns an iterator
    # that cannot be indexed.
    fields = [field.strip() for field in data.split(',')]
    return {
        'position': fields[0],
        'team': fields[1],
        # Only the leading character is converted, exactly as before; any
        # text after that first digit is ignored.
        'num': int(fields[2][0]),
    }
def process_interns_file(filename):
    """Read the start-date document and return one record per intern.

    The file is scanned line by line.  A line matching ``RE_DATE`` starts a
    group; the intern lines (matching ``RE_INTERN``) that follow it directly
    are parsed and tagged with that date.  Any other line ends the group, and
    intern lines are ignored until the next date header.

    Args:
        filename: Path of the text file to parse.

    Returns:
        A list of dicts as produced by ``parse_intern``, each with an extra
        ``'start'`` key holding the stripped date header line.
    """
    interns = []
    # Date header of the group currently being read; None when outside a group.
    current_date = None
    # Open the path that was passed in (previously a hard-coded 'dates.txt').
    with open(filename, 'r') as f:
        # A single pass with explicit state replaces the manual f.next()
        # look-ahead, which raised StopIteration when the file ended on an
        # intern line and swallowed a date header that directly followed one.
        for line in f:
            if RE_DATE.match(line):
                current_date = line.strip()
            elif current_date is not None and RE_INTERN.match(line):
                intern = parse_intern(line)
                intern['start'] = current_date
                interns.append(intern)
            else:
                # Interns must be contiguous with their date header.
                current_date = None
    return interns
def output_csv(fname, data):
    """Write intern records to ``fname`` as CSV, preceded by a header row.

    Args:
        fname: Path of the file to create or overwrite.
        data: Iterable of dicts.  Keys missing from a record are written as
            ``'N/A'``.
    """
    columns = ['name', 'position', 'team', 'num', 'start']
    with open(fname, 'w') as out:
        csv_out = csv.DictWriter(out, columns, restval='N/A')
        csv_out.writeheader()
        for record in data:
            csv_out.writerow(record)
if __name__ == '__main__':
    # Command line: <script> infile outfile
    args = sys.argv
    if len(args) < 3:
        # Call form of print: same output on Python 2, and not a syntax
        # error on Python 3 like the print statement.
        print("Usage: python intern_parser.py infile outfile")
    else:
        output_csv(args[2], process_interns_file(args[1]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment