Parsing a list of 2013 Oscar nominees into CSV. (The raw text is formatted strangely w.r.t. case because it was copy-pasted from a PDF.)
BEST PICTURE | |
Amour | |
TBD, Producer | |
Argo | |
Grant Heslov, Ben Affleck and George Clooney, Producers | |
BeAsts of the southern Wild | |
Dan Janvey, Josh Penn and Michael Gottwald, Producers | |
djAngo unchAined | |
Stacey Sher, Reginald Hudlin and Pilar Savone, Producers | |
les misérABles | |
Tim Bevan, Eric Fellner, Debra Hayward and Cameron Mackintosh, Producers | |
life of Pi | |
Gil Netter, Ang Lee and David Womark, Producers | |
lincoln | |
Steven Spielberg and Kathleen Kennedy, Producers | |
silver linings PlAyBook | |
Donna Gigliotti, Bruce Cohen and Jonathan Gordon, Producers | |
Zero dArk thirty | |
Mark Boal, Kathryn Bigelow and Megan Ellison, Producers | |
ACToR In A LEAdIng RoLE | |
SiLvER LiNiNGS PLAyBooK | |
Bradley cooper | |
LiNCoLN | |
daniel day-lewis | |
LES MiSéRABLES | |
hugh jackman | |
THE MASTER | |
joaquin Phoenix | |
FLiGHT | |
denzel Washington | |
ACToR In A SUPPoRTIng RoLE | |
ARGo | |
Alan Arkin | |
SiLvER LiNiNGS PLAyBooK | |
robert de niro | |
THE MASTER | |
Philip seymour hoffman | |
LiNCoLN | |
tommy lee jones | |
DJANGo UNCHAiNED | |
christoph Waltz | |
ACTRESS In A LEAdIng RoLE | |
ZERo DARK THiRTy | |
jessica chastain | |
SiLvER LiNiNGS PLAyBooK | |
jennifer lawrence | |
AMoUR | |
emmanuelle riva | |
BEASTS oF THE SoUTHERN WiLD | |
Quvenzhané Wallis | |
THE iMPoSSiBLE | |
naomi Watts | |
ACTRESS In A SUPPoRTIng RoLE | |
THE MASTER | |
Amy Adams | |
LiNCoLN | |
sally field | |
LES MiSéRABLES | |
Anne hathaway | |
THE SESSioNS | |
helen hunt | |
SiLvER LiNiNGS PLAyBooK | |
jacki Weaver | |
AnImATEd FEATURE FILm | |
BrAve | |
Mark Andrews and Brenda Chapman | |
frAnkenWeenie | |
Tim Burton | |
PArAnormAn | |
Sam Fell and Chris Butler | |
the PirAtes! BAnd of misfits | |
Peter Lord | |
Wreck-it rAlPh | |
Rich Moore | |
CInEmATogRAPHY | |
AnnA kAreninA | |
Seamus McGarvey | |
djAngo unchAined | |
Robert Richardson | |
life of Pi | |
Claudio Miranda | |
lincoln | |
Janusz Kaminski | |
skyfAll | |
Roger Deakins | |
CoSTUmE dESIgn | |
AnnA kAreninA | |
Jacqueline Durran | |
les misérABles | |
Paco Delgado | |
lincoln | |
Joanna Johnston | |
mirror mirror | |
Eiko ishioka | |
snoW White And the huntsmAn | |
Colleen Atwood | |
dIRECTIng | |
Amour | |
Michael Haneke | |
BeAsts of the southern Wild | |
Benh Zeitlin | |
life of Pi | |
Ang Lee | |
lincoln | |
Steven Spielberg | |
silver linings PlAyBook | |
David o. Russell | |
doCUmEnTARY FEATURE | |
5 Broken cAmerAs | |
Emad Burnat and Guy Davidi | |
the gAtekeePers | |
TBD | |
hoW to survive A PlAgue | |
TBD | |
the invisiBle WAr | |
TBD | |
seArching for sugAr mAn | |
TBD | |
doCUmEnTARY SHoRT SUBjECT | |
inocente | |
Sean Fine and Andrea Nix Fine | |
kings Point | |
Sari Gilman and Jedd Wider | |
mondAys At rAcine | |
Cynthia Wade and Robin Honan | |
oPen heArt | |
Kief Davidson and Cori Shepherd Stern | |
redemPtion | |
Jon Alpert and Matthew o’Neill | |
FILm EdITIng | |
Argo | |
William Goldenberg | |
life of Pi | |
Tim Squyres | |
lincoln | |
Michael Kahn | |
silver linings PlAyBook | |
Jay Cassidy and Crispin Struthers | |
Zero dArk thirty | |
Dylan Tichenor and William Goldenberg | |
FoREIgn LAngUAgE FILm | |
Amour | |
Austria | |
kon-tiki | |
Norway | |
no | |
Chile | |
A royAl AffAir | |
Denmark | |
WAr Witch | |
Canada | |
mAKEUP And HAIRSTYLIng | |
hitchcock | |
Howard Berger, Peter Montagna and Martin Samuel | |
the hoBBit: An unexPected journey | |
Peter Swords King, Rick Findlater and Tami Lane | |
les misérABles | |
Lisa Westcott and Julie Dartnell | |
mUSIC (oRIgInAL SCoRE) | |
AnnA kAreninA | |
Dario Marianelli | |
Argo | |
Alexandre Desplat | |
life of Pi | |
Mychael Danna | |
lincoln | |
John Williams | |
skyfAll | |
Thomas Newman | |
mUSIC (oRIgInAL Song) | |
CHASiNG iCE | |
“Before my time”, Music and Lyric by J. Ralph | |
TED | |
“Everybody needs A Best friend”, Music by Walter Murphy, Lyric by Seth MacFarlane | |
LiFE oF Pi | |
“Pi’s lullaby”, Music by Mychael Danna, Lyric by Bombay Jayashri | |
SKyFALL | |
“skyfall”, Music and Lyric by Adele Adkins and Paul Epworth | |
LES MiSéRABLES | |
“suddenly”, Music by Claude-Michel Schönberg, Lyric by Herbert Kretzmer and Alain Boublil | |
PRodUCTIon dESIgn | |
AnnA kAreninA | |
Production Design: Sarah Greenwood, Set Decoration: Katie Spencer | |
the hoBBit: An unexPected journey | |
Production Design: Dan Hennah, Set Decoration: Ra vincent and Simon Bright | |
les misérABles | |
Production Design: Eve Stewart, Set Decoration: Anna Lynch-Robinson | |
life of Pi | |
Production Design: David Gropman, Set Decoration: Anna Pinnock | |
lincoln | |
Production Design: Rick Carter, Set Decoration: Jim Erickson | |
SHoRT FILm (AnImATEd) | |
AdAm And dog | |
Minkyu Lee | |
fresh guAcAmole | |
PES | |
heAd over heels | |
Timothy Reckart and Fodhla Cronin o’Reilly | |
mAggie simPson in “the longest dAycAre” | |
David Silverman | |
PAPermAn | |
John Kahrs | |
SHoRT FILm (LIvE ACTIon) | |
AsAd | |
Bryan Buckley and Mino Jarjoura | |
BuZkAshi Boys | |
Sam French and Ariel Nasr | |
curfeW | |
Shawn Christensen | |
deAth of A shAdoW (dood vAn een schAduW) | |
Tom van Avermaet and Ellen De Waele | |
henry | |
yan England | |
SoUnd EdITIng | |
Argo | |
Erik Aadahl and Ethan van der Ryn | |
djAngo unchAined | |
Wylie Stateman | |
life of Pi | |
Eugene Gearty and Philip Stockton | |
skyfAll | |
Per Hallberg and Karen Baker Landers | |
Zero dArk thirty | |
Paul N.J. ottosson | |
SoUnd mIXIng | |
Argo | |
John Reitz, Gregg Rudloff and Jose Antonio Garcia | |
les misérABles | |
Andy Nelson, Mark Paterson and Simon Hayes | |
life of Pi | |
Ron Bartlett, D.M. Hemphill and Drew Kunin | |
lincoln | |
Andy Nelson, Gary Rydstrom and Ronald Judkins | |
skyfAll | |
Scott Millan, Greg P. Russell and Stuart Wilson | |
vISUAL EFFECTS | |
the hoBBit: An unexPected journey | |
Joe Letteri, Eric Saindon, David Clayton and R. Christopher White | |
life of Pi | |
Bill Westenhofer, Guillaume Rocheron, Erik-Jan De Boer and Donald R. Elliott | |
mArvel’s the Avengers | |
Janek Sirrs, Jeff White, Guy Williams and Dan Sudick | |
Prometheus | |
Richard Stammers, Trevor Wood, Charley Henley and Martin Hill | |
snoW White And the huntsmAn | |
Cedric Nicolas-Troyan, Philip Brennan, Neil Corbould and Michael Dawson | |
WRITIng (AdAPTEd SCREEnPLAY) | |
Argo | |
Screenplay by Chris Terrio | |
BeAsts of the southern Wild | |
Screenplay by Lucy Alibar & Benh Zeitlin | |
life of Pi | |
Screenplay by David Magee | |
lincoln | |
Screenplay by Tony Kushner | |
silver linings PlAyBook | |
Screenplay by David o. Russell | |
WRITIng (oRIgInAL SCREEnPLAY) | |
Amour | |
Written by Michael Haneke | |
djAngo unchAined | |
Written by Quentin Tarantino | |
flight | |
Written by John Gatins | |
moonrise kingdom | |
Written by Wes Anderson & Roman Coppola | |
Zero dArk thirty | |
Written by Mark Boal |
# -*- coding: utf-8 -*- | |
""" | |
Scans and parses a very specific form of input, text converted from a PDF list | |
of nominees from the Academy Awards website, into a CSV file | |
""" | |
import csv | |
import sys | |
from collections import namedtuple | |
# Token types produced by the lexer states and consumed by Parser.parse().
CATEGORY = 0
FILM = 1
NAMES = 2
BLANK = 3

# One nominated entry: the film title and the credited name(s) line.
Nominee = namedtuple('Nominee', 'film names')
class Category:
    """An award category together with the nominees collected for it."""

    def __init__(self, name):
        # Starts empty; Nominee tuples are appended while parsing.
        self.nominees = list()
        self.name = name
class Lexer:
    """Drive a chain of state functions and expose their output as tokens.

    A state is a callable that receives the lexer, optionally calls
    emit() once, and returns the next state (or None to stop).
    """

    def __init__(self):
        # The token most recently emitted and not yet handed to the consumer.
        self.token = None

    def run(self, start_state):
        """Yield (toktype, value) tuples until a state returns None.

        Each state is executed first and its token is yielded afterwards,
        then cleared.  This way a state that emits nothing does not cause
        the previous token to be yielded a second time, and a token emitted
        by the final state (the one returning None) is not lost.
        """
        state = start_state
        while state is not None:
            state = state(self)
            if self.token is not None:
                # Hand the token over exactly once.
                token, self.token = self.token, None
                yield token

    def emit(self, toktype, value):
        """Record a token for run() to yield after the current state returns."""
        self.token = (toktype, value)
class Parser:
    """Groups the lexer's token stream into Category objects."""

    def __init__(self, lexer):
        self.lexer = lexer
        self.cat = None
        self.film = None

    def reset(self):
        """Forget the category and film currently being assembled."""
        self.film = None
        self.cat = None

    def parse(self):
        """Consume every token from the lexer and return the categories.

        A BLANK token closes the current category; a category still open
        when the tokens run out is appended as well.
        """
        finished = []
        tokens = self.lexer.run(lex_category)
        for kind, text in tokens:
            if kind == BLANK:
                # Blank line: the current category is complete.
                finished.append(self.cat)
                self.reset()
                continue
            if kind == NAMES:
                # A names line completes the nominee for the pending film.
                self.cat.nominees.append(Nominee(self.film, text))
                continue
            if kind == FILM:
                self.film = text
                continue
            if kind == CATEGORY:
                self.cat = Category(text)
        if self.cat is not None:
            # Input ended without a trailing blank line.
            finished.append(self.cat)
        return finished
def getline():
    """Read one line from stdin.

    Returns the line with surrounding whitespace stripped, or None once
    readline() reports end of input with an empty string.
    """
    raw = sys.stdin.readline()
    return raw.strip() if raw else None
def title(s):
    """Return s in title case without capitalising apostrophe suffixes.

    str.title() treats an apostrophe as a word boundary, so a possessive
    such as "Pi's" comes out as "Pi'S".  After title-casing, the common
    English suffixes (s, t, d, m, ll, re, ve) that directly follow an
    apostrophe inside a word are lowercased again.  Longer parts such as
    the "Neill" in "O'Neill" keep their capital.  Both the straight and
    the typographic (U+2019) apostrophe are handled.
    """
    import re  # local import: the module header does not import re

    return re.sub(
        # letter, apostrophe, short suffix, end of word
        r"(?<=[^\W\d_])(['\u2019])(S|T|D|M|Ll|Re|Ve)\b",
        lambda m: m.group(1) + m.group(2).lower(),
        s.title(),
    )
def lex_category(lexer):
    """Lexer state: read a category heading and continue with lex_film.

    Blank lines in front of the heading are skipped so that several blank
    lines between categories do not produce an empty category name.  At
    end of input the state machine is stopped; previously None was passed
    to title(), which raised AttributeError for empty input or input that
    ends with a blank line.
    """
    line = getline()
    # getline() returns '' for a blank line and None at EOF.
    while line == '':
        line = getline()
    if line is None:  # EOF, shut down the machine
        return None
    lexer.emit(CATEGORY, title(line))
    return lex_film
def lex_film(lexer):
    """Lexer state: read a film title line.

    End of input stops the machine, a blank line emits BLANK and switches
    to lex_category, and any other line is emitted as a FILM token before
    switching to lex_names.
    """
    line = getline()
    if line is None:
        # EOF, shut down the machine
        return None
    if not line:
        lexer.emit(BLANK, '')
        return lex_category
    lexer.emit(FILM, title(line))
    return lex_names
def lex_names(lexer):
    """Lexer state: read the name(s) line that follows a film title.

    Emits a NAMES token and switches back to lex_film.  If the input ends
    right after a film line, the machine is stopped the same way lex_film
    does at EOF; previously None was passed to title(), which raised
    AttributeError.
    """
    line = getline()
    if line is None:  # EOF, shut down the machine
        return None
    lexer.emit(NAMES, title(line))
    return lex_film
def flatten(categories):
    """Lazily yield one (category name, film, names) tuple per nominee."""
    rows = (
        (category.name, nominee.film, nominee.names)
        for category in categories
        for nominee in category.nominees
    )
    for row in rows:
        yield row
def main():
    """Parse the nominee list from stdin and write it to stdout as CSV."""
    header = ['Category', 'Film', 'Name(s)']
    out = csv.writer(sys.stdout)
    # The header row is written before any input is parsed.
    out.writerow(header)
    categories = Parser(Lexer()).parse()
    out.writerows(flatten(categories))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment