Created
March 7, 2012 14:28
-
-
Save hodbby/1993457 to your computer and use it in GitHub Desktop.
goog regex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def extract_names(filename): | |
""" | |
Given a file name for baby.html, returns a list starting with the year string | |
followed by the name-rank strings in alphabetical order. | |
['2006', 'Aaliyah 91', Aaron 57', 'Abagail 895', ' ...] | |
""" | |
# +++your code here+++ | |
# LAB(begin solution) | |
# The list [year, name_and_rank, name_and_rank, ...] we'll eventually return. | |
names = [] | |
# Open and read the file. | |
f = open(filename, 'rU') | |
text = f.read() | |
# Could process the file line-by-line, but regex on the whole text | |
# at once is even easier. | |
# Get the year. | |
year_match = re.search(r'Popularity\sin\s(\d\d\d\d)', text) | |
if not year_match: | |
# We didn't find a year, so we'll exit with an error message. | |
sys.stderr.write('Couldn\'t find the year!\n') | |
sys.exit(1) | |
year = year_match.group(1) | |
names.append(year) | |
# Extract all the data tuples with a findall() | |
# each tuple is: (rank, boy-name, girl-name) | |
tuples = re.findall(r'<td>(\d+)</td><td>(\w+)</td>\<td>(\w+)</td>', text) | |
#print tuples | |
# Store data into a dict using each name as a key and that | |
# name's rank number as the value. | |
# (if the name is already in there, don't add it, since | |
# this new rank will be bigger than the previous rank). | |
names_to_rank = {} | |
for rank_tuple in tuples: | |
(rank, boyname, girlname) = rank_tuple # unpack the tuple into 3 vars | |
if boyname not in names_to_rank: | |
names_to_rank[boyname] = rank | |
if girlname not in names_to_rank: | |
names_to_rank[girlname] = rank | |
# You can also write: | |
# for rank, boyname, girlname in tuples: | |
# ... | |
# To unpack the tuples inside a for-loop. | |
# Get the names, sorted in the right order | |
sorted_names = sorted(names_to_rank.keys()) | |
# Build up result list, one element per line | |
for name in sorted_names: | |
names.append(name + " " + names_to_rank[name]) | |
return names | |
# LAB(replace solution) | |
# return | |
# LAB(end solution) | |
def main(): | |
# This command-line parsing code is provided. | |
# Make a list of command line arguments, omitting the [0] element | |
# which is the script itself. | |
args = sys.argv[1:] | |
if not args: | |
print 'usage: [--summaryfile] file [file ...]' | |
sys.exit(1) | |
# Notice the summary flag and remove it from args if it is present. | |
summary = False | |
if args[0] == '--summaryfile': | |
summary = True | |
del args[0] | |
# +++your code here+++ | |
# For each filename, get the names, then either print the text output | |
# or write it to a summary file | |
# LAB(begin solution) | |
for filename in args: | |
names = extract_names(filename) | |
# Make text out of the whole list | |
text = '\n'.join(names) | |
if summary: | |
outf = open(filename + '.summary', 'w') | |
outf.write(text + '\n') | |
outf.close() | |
else: | |
print text | |
# LAB(end solution) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment