Last active
December 10, 2015 23:18
-
-
Save paulsmith/4507999 to your computer and use it in GitHub Desktop.
Parsing a list of 2013 Oscar nominees into CSV. (The raw text is formatted strangely w.r.t. case because it was copy-pasted from a PDF.)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
BEST PICTURE | |
Amour | |
TBD, Producer | |
Argo | |
Grant Heslov, Ben Affleck and George Clooney, Producers | |
BeAsts of the southern Wild | |
Dan Janvey, Josh Penn and Michael Gottwald, Producers | |
djAngo unchAined | |
Stacey Sher, Reginald Hudlin and Pilar Savone, Producers | |
les misérABles | |
Tim Bevan, Eric Fellner, Debra Hayward and Cameron Mackintosh, Producers | |
life of Pi | |
Gil Netter, Ang Lee and David Womark, Producers | |
lincoln | |
Steven Spielberg and Kathleen Kennedy, Producers | |
silver linings PlAyBook | |
Donna Gigliotti, Bruce Cohen and Jonathan Gordon, Producers | |
Zero dArk thirty | |
Mark Boal, Kathryn Bigelow and Megan Ellison, Producers | |
ACToR In A LEAdIng RoLE | |
SiLvER LiNiNGS PLAyBooK | |
Bradley cooper | |
LiNCoLN | |
daniel day-lewis | |
LES MiSéRABLES | |
hugh jackman | |
THE MASTER | |
joaquin Phoenix | |
FLiGHT | |
denzel Washington | |
ACToR In A SUPPoRTIng RoLE | |
ARGo | |
Alan Arkin | |
SiLvER LiNiNGS PLAyBooK | |
robert de niro | |
THE MASTER | |
Philip seymour hoffman | |
LiNCoLN | |
tommy lee jones | |
DJANGo UNCHAiNED | |
christoph Waltz | |
ACTRESS In A LEAdIng RoLE | |
ZERo DARK THiRTy | |
jessica chastain | |
SiLvER LiNiNGS PLAyBooK | |
jennifer lawrence | |
AMoUR | |
emmanuelle riva | |
BEASTS oF THE SoUTHERN WiLD | |
Quvenzhané Wallis | |
THE iMPoSSiBLE | |
naomi Watts | |
ACTRESS In A SUPPoRTIng RoLE | |
THE MASTER | |
Amy Adams | |
LiNCoLN | |
sally field | |
LES MiSéRABLES | |
Anne hathaway | |
THE SESSioNS | |
helen hunt | |
SiLvER LiNiNGS PLAyBooK | |
jacki Weaver | |
AnImATEd FEATURE FILm | |
BrAve | |
Mark Andrews and Brenda Chapman | |
frAnkenWeenie | |
Tim Burton | |
PArAnormAn | |
Sam Fell and Chris Butler | |
the PirAtes! BAnd of misfits | |
Peter Lord | |
Wreck-it rAlPh | |
Rich Moore | |
CInEmATogRAPHY | |
AnnA kAreninA | |
Seamus McGarvey | |
djAngo unchAined | |
Robert Richardson | |
life of Pi | |
Claudio Miranda | |
lincoln | |
Janusz Kaminski | |
skyfAll | |
Roger Deakins | |
CoSTUmE dESIgn | |
AnnA kAreninA | |
Jacqueline Durran | |
les misérABles | |
Paco Delgado | |
lincoln | |
Joanna Johnston | |
mirror mirror | |
Eiko ishioka | |
snoW White And the huntsmAn | |
Colleen Atwood | |
dIRECTIng | |
Amour | |
Michael Haneke | |
BeAsts of the southern Wild | |
Benh Zeitlin | |
life of Pi | |
Ang Lee | |
lincoln | |
Steven Spielberg | |
silver linings PlAyBook | |
David o. Russell | |
doCUmEnTARY FEATURE | |
5 Broken cAmerAs | |
Emad Burnat and Guy Davidi | |
the gAtekeePers | |
TBD | |
hoW to survive A PlAgue | |
TBD | |
the invisiBle WAr | |
TBD | |
seArching for sugAr mAn | |
TBD | |
doCUmEnTARY SHoRT SUBjECT | |
inocente | |
Sean Fine and Andrea Nix Fine | |
kings Point | |
Sari Gilman and Jedd Wider | |
mondAys At rAcine | |
Cynthia Wade and Robin Honan | |
oPen heArt | |
Kief Davidson and Cori Shepherd Stern | |
redemPtion | |
Jon Alpert and Matthew o’Neill | |
FILm EdITIng | |
Argo | |
William Goldenberg | |
life of Pi | |
Tim Squyres | |
lincoln | |
Michael Kahn | |
silver linings PlAyBook | |
Jay Cassidy and Crispin Struthers | |
Zero dArk thirty | |
Dylan Tichenor and William Goldenberg | |
FoREIgn LAngUAgE FILm | |
Amour | |
Austria | |
kon-tiki | |
Norway | |
no | |
Chile | |
A royAl AffAir | |
Denmark | |
WAr Witch | |
Canada | |
mAKEUP And HAIRSTYLIng | |
hitchcock | |
Howard Berger, Peter Montagna and Martin Samuel | |
the hoBBit: An unexPected journey | |
Peter Swords King, Rick Findlater and Tami Lane | |
les misérABles | |
Lisa Westcott and Julie Dartnell | |
mUSIC (oRIgInAL SCoRE) | |
AnnA kAreninA | |
Dario Marianelli | |
Argo | |
Alexandre Desplat | |
life of Pi | |
Mychael Danna | |
lincoln | |
John Williams | |
skyfAll | |
Thomas Newman | |
mUSIC (oRIgInAL Song) | |
CHASiNG iCE | |
“Before my time”, Music and Lyric by J. Ralph | |
TED | |
“Everybody needs A Best friend”, Music by Walter Murphy, Lyric by Seth MacFarlane | |
LiFE oF Pi | |
“Pi’s lullaby”, Music by Mychael Danna, Lyric by Bombay Jayashri | |
SKyFALL | |
“skyfall”, Music and Lyric by Adele Adkins and Paul Epworth | |
LES MiSéRABLES | |
“suddenly”, Music by Claude-Michel Schönberg, Lyric by Herbert Kretzmer and Alain Boublil | |
PRodUCTIon dESIgn | |
AnnA kAreninA | |
Production Design: Sarah Greenwood, Set Decoration: Katie Spencer | |
the hoBBit: An unexPected journey | |
Production Design: Dan Hennah, Set Decoration: Ra vincent and Simon Bright | |
les misérABles | |
Production Design: Eve Stewart, Set Decoration: Anna Lynch-Robinson | |
life of Pi | |
Production Design: David Gropman, Set Decoration: Anna Pinnock | |
lincoln | |
Production Design: Rick Carter, Set Decoration: Jim Erickson | |
SHoRT FILm (AnImATEd) | |
AdAm And dog | |
Minkyu Lee | |
fresh guAcAmole | |
PES | |
heAd over heels | |
Timothy Reckart and Fodhla Cronin o’Reilly | |
mAggie simPson in “the longest dAycAre” | |
David Silverman | |
PAPermAn | |
John Kahrs | |
SHoRT FILm (LIvE ACTIon) | |
AsAd | |
Bryan Buckley and Mino Jarjoura | |
BuZkAshi Boys | |
Sam French and Ariel Nasr | |
curfeW | |
Shawn Christensen | |
deAth of A shAdoW (dood vAn een schAduW) | |
Tom van Avermaet and Ellen De Waele | |
henry | |
yan England | |
SoUnd EdITIng | |
Argo | |
Erik Aadahl and Ethan van der Ryn | |
djAngo unchAined | |
Wylie Stateman | |
life of Pi | |
Eugene Gearty and Philip Stockton | |
skyfAll | |
Per Hallberg and Karen Baker Landers | |
Zero dArk thirty | |
Paul N.J. ottosson | |
SoUnd mIXIng | |
Argo | |
John Reitz, Gregg Rudloff and Jose Antonio Garcia | |
les misérABles | |
Andy Nelson, Mark Paterson and Simon Hayes | |
life of Pi | |
Ron Bartlett, D.M. Hemphill and Drew Kunin | |
lincoln | |
Andy Nelson, Gary Rydstrom and Ronald Judkins | |
skyfAll | |
Scott Millan, Greg P. Russell and Stuart Wilson | |
vISUAL EFFECTS | |
the hoBBit: An unexPected journey | |
Joe Letteri, Eric Saindon, David Clayton and R. Christopher White | |
life of Pi | |
Bill Westenhofer, Guillaume Rocheron, Erik-Jan De Boer and Donald R. Elliott | |
mArvel’s the Avengers | |
Janek Sirrs, Jeff White, Guy Williams and Dan Sudick | |
Prometheus | |
Richard Stammers, Trevor Wood, Charley Henley and Martin Hill | |
snoW White And the huntsmAn | |
Cedric Nicolas-Troyan, Philip Brennan, Neil Corbould and Michael Dawson | |
WRITIng (AdAPTEd SCREEnPLAY) | |
Argo | |
Screenplay by Chris Terrio | |
BeAsts of the southern Wild | |
Screenplay by Lucy Alibar & Benh Zeitlin | |
life of Pi | |
Screenplay by David Magee | |
lincoln | |
Screenplay by Tony Kushner | |
silver linings PlAyBook | |
Screenplay by David o. Russell | |
WRITIng (oRIgInAL SCREEnPLAY) | |
Amour | |
Written by Michael Haneke | |
djAngo unchAined | |
Written by Quentin Tarantino | |
flight | |
Written by John Gatins | |
moonrise kingdom | |
Written by Wes Anderson & Roman Coppola | |
Zero dArk thirty | |
Written by Mark Boal |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Scans and parses a very specific form of input, text converted from a PDF list | |
of nominees from the Academy Awards website, into a CSV file | |
""" | |
import csv
import re
import sys
from collections import namedtuple
# Token types produced by the lexer states, numbered in declaration order.
CATEGORY, FILM, NAMES, BLANK = 0, 1, 2, 3

# One nominated entry within a category: the film and the credited name(s).
Nominee = namedtuple('Nominee', 'film names')
class Category:
    """An award category together with the nominees collected for it."""

    def __init__(self, name):
        # Each instance gets its own, initially empty, nominee list.
        self.name, self.nominees = name, []
class Lexer:
    """Drive a chain of state functions and expose what they emit as tokens.

    A state is a callable that receives the lexer, may call ``emit`` to
    publish a single token, and returns the next state to run (or ``None``
    to stop the machine).
    """

    def __init__(self):
        # Token published by the state that ran most recently, or None.
        self.token = None

    def run(self, start_state):
        """Yield ``(toktype, value)`` pairs until a state returns ``None``.

        The pending token is cleared before every state call and yielded
        right after it, so a state that emits nothing does not cause the
        previous token to be yielded a second time, and a token emitted by
        the final state (the one that returns ``None``) is not lost.
        """
        state = start_state
        while state is not None:
            self.token = None
            state = state(self)
            if self.token is not None:
                yield self.token

    def emit(self, toktype, value):
        """Record the token produced by the currently running state."""
        self.token = (toktype, value)
class Parser:
    """Assemble the lexer's token stream into a list of Category objects."""

    def __init__(self, lexer):
        self.lexer = lexer
        # Category currently being filled and the film most recently seen.
        self.cat = None
        self.film = None

    def reset(self):
        """Forget the category and film in progress."""
        self.cat = self.film = None

    def parse(self):
        """Consume every token and return the categories in input order.

        A BLANK token closes the category in progress; a category still
        open when the token stream ends is appended as well.
        """
        finished = []
        for kind, text in self.lexer.run(lex_category):
            if kind == BLANK:
                finished.append(self.cat)
                self.reset()
                continue
            if kind == NAMES:
                # Names always belong to the film seen just before them.
                self.cat.nominees.append(Nominee(self.film, text))
            elif kind == FILM:
                self.film = text
            elif kind == CATEGORY:
                self.cat = Category(text)
        if self.cat is not None:
            finished.append(self.cat)
        return finished
def getline():
    """Read one line from stdin with surrounding whitespace stripped.

    Returns None once stdin is exhausted; a blank input line comes back as
    the empty string.
    """
    raw = sys.stdin.readline()
    # readline() gives '' only at EOF; a blank line still contains '\n'.
    return raw.strip() if raw else None
# str.title() treats an apostrophe as a word boundary, so "Pi’s" becomes
# "Pi’S". This matches a straight or curly apostrophe that follows a word
# character and is itself followed by a common contraction/possessive
# suffix (as title-cased by str.title()) ending the word. Longer name parts
# such as the "Neill" in "O’Neill" are deliberately not matched.
_CONTRACTION_RE = re.compile(r"(?<=\w)(['\u2019])(S|T|D|M|Re|Ve|Ll)\b")


def title(s):
    """Return *s* in title case without capitalising contraction suffixes.

    The string is title-cased with ``str.title()`` and then suffixes such
    as ``’S`` or ``'Re`` are put back into lower case, e.g.
    ``"mArvel’s the Avengers"`` -> ``"Marvel’s The Avengers"``.
    """
    return _CONTRACTION_RE.sub(
        lambda m: m.group(1) + m.group(2).lower(),
        s.title(),
    )
def lex_category(lexer):
    """Lexer state: read a category heading and emit a CATEGORY token.

    Blank lines in front of the heading are skipped, so leading or repeated
    blank separators do not produce an empty category. At end of input no
    token is emitted and None is returned to stop the machine; previously
    this path called title(None) and raised AttributeError (e.g. for empty
    input or a trailing blank line).
    """
    line = getline()
    while line == '':
        line = getline()
    if line is None:  # EOF, shut down the machine
        return None
    lexer.emit(CATEGORY, title(line))
    return lex_film
def lex_film(lexer):
    """Lexer state: read the next line where a film title is expected.

    End of input stops the machine, a blank line emits BLANK and hands over
    to lex_category, and anything else is emitted as a FILM token followed
    by lex_names.
    """
    text = getline()
    if text is None:
        # EOF: nothing to emit, returning None ends Lexer.run().
        return None
    if text == '':
        lexer.emit(BLANK, '')
        return lex_category
    lexer.emit(FILM, title(text))
    return lex_names
def lex_names(lexer):
    """Lexer state: read the name(s) line that follows a film title.

    Emits a NAMES token and returns lex_film. If the input ends right after
    a film line, getline() returns None; that is treated like a blank names
    line (empty string) so the film is still recorded, instead of calling
    title(None) and raising AttributeError. The following lex_film call
    then sees EOF and stops the machine.
    """
    line = getline()
    lexer.emit(NAMES, title(line) if line is not None else '')
    return lex_film
def flatten(categories):
    """Produce one (category name, film, names) row per nominee.

    Rows come out in category order, then nominee order; categories without
    nominees contribute no rows.
    """
    return (
        (category.name, nominee.film, nominee.names)
        for category in categories
        for nominee in category.nominees
    )
def main():
    """Parse the nominee list from stdin and write it to stdout as CSV."""
    out = csv.writer(sys.stdout)
    # Header first, then one row per (category, film, names) triple.
    out.writerow(['Category', 'Film', 'Name(s)'])
    categories = Parser(Lexer()).parse()
    for row in flatten(categories):
        out.writerow(row)
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment