Created
February 13, 2011 23:46
-
-
Save palfrey/825310 to your computer and use it in GitHub Desktop.
#compoundMovies generator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gzip | |
from re import compile | |
from os.path import exists | |
from sys import exit, argv | |
from xml.dom.minidom import parseString | |
import codecs | |
try: | |
import urlgrab | |
except ImportError: | |
print "run 'git clone git://github.com/palfrey/urlgrab.git' and re-run this script" | |
exit(-1) | |
def imdb(path="movies.list.gz"):
    """Yield movie titles read from a gzipped IMDb ``movies.list`` dump.

    Everything up to and including the first line containing ``=======``
    is treated as a header and skipped.  After that, each line is split on
    tabs; lines with fewer than two non-empty fields are ignored.  The first
    field is taken as the title, and the first parenthesised marker made of
    digits, ``I``, ``V`` and ``/`` (for example ``(1979)`` or ``(2009/I)``)
    is cut off together with everything that follows it.

    Args:
        path: location of the gzipped list file.  Defaults to
            ``movies.list.gz`` in the current directory.

    Yields:
        The stripped title of each entry, as text.

    Raises:
        SystemExit: with code -1 (after printing download instructions)
            when the file cannot be opened.
    """
    try:
        items = gzip.open(path)
    except IOError:
        print("Please goto http://www.imdb.com/interfaces#plain, goto one of the sites and download a copy of movies.list.gz into this directory")
        exit(-1)

    year_pattern = compile(r"(.*?)(\([\dIV/]+\))")
    in_list = False
    try:
        for raw in items:
            # gzip hands back bytes; decode once here so every later string
            # operation (and the UTF-8 output file) works on text.
            # NOTE(review): assumes the dump is ISO-8859-1 encoded - confirm
            # against the file you downloaded.
            line = raw.decode("latin-1")
            if not in_list:
                # Still in the preamble: the separator line itself is also
                # skipped.
                if "=======" in line:
                    in_list = True
                continue
            fields = [field.strip() for field in line.split("\t") if field.strip()]
            if len(fields) < 2:
                continue
            movie = fields[0]
            # Keep only the text in front of the parenthesised marker; repeat
            # until no marker is left.
            while True:
                marker = year_pattern.search(movie)
                if marker is None:
                    break
                movie = marker.group(1).strip()
            yield movie
    finally:
        # Runs when the generator is exhausted or closed early, so the
        # file handle is never leaked.
        items.close()
def wikipedia(category, cache=None):
    """Yield the page titles listed in an English Wikipedia category.

    The category is fetched through the ``categorymembers`` list of the
    MediaWiki API as XML.  Each requested URL is printed before it is
    fetched.  Titles starting with ``User:`` are skipped, and a trailing
    disambiguator of the form `` (film)`` or `` (<digits> film)`` is removed
    together with anything after it.

    Args:
        category: category name, inserted verbatim after ``Category:`` in
            the request URL (no URL escaping is applied).
        cache: optional object with a ``get(url)`` method whose result has a
            ``read()`` method returning the XML document.  Defaults to a new
            ``urlgrab.Cache()``.

    Yields:
        One title per ``cm`` element of every fetched page of results.
    """
    baseurl = "http://en.wikipedia.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:%s&cmsort=timestamp&cmdir=desc&format=xml&cmlimit=max" % category
    if cache is None:
        cache = urlgrab.Cache()
    film_suffix = compile(r"(.*?)( \((?:\d+ )?film\))")

    url = baseurl
    while url is not None:
        print(url)
        doc = parseString(cache.get(url).read())

        for element in doc.getElementsByTagName("cm"):
            title = element.getAttribute("title")
            if title.startswith("User:"):
                continue
            suffix = film_suffix.search(title)
            yield title if suffix is None else suffix.group(1)

        # Continuation: the first ``categorymembers`` element carrying a
        # ``cmstart`` attribute tells us where the next page begins; when
        # there is none, the listing is complete.
        # NOTE(review): confirm the API still reports continuation through
        # a ``cmstart`` attribute.
        url = None
        for element in doc.getElementsByTagName("categorymembers"):
            if element.hasAttribute("cmstart"):
                url = baseurl + "&cmstart=" + element.getAttribute("cmstart")
                break
# A run of three or more ASCII letters; used to decide whether the first or
# last word of a title is usable as a join key.
isword = compile("([a-zA-Z]{3,})")


def _keyword(token):
    """Return ``token`` lower-cased if it is entirely 3+ ASCII letters, else None."""
    word = isword.match(token)
    if word and word.group(1) == token:
        return token.lower()
    return None


def _join_titles(first, last):
    """Yield ``(n, m, combined)`` for every title ``n`` ending in a key that title ``m`` starts with."""
    for key in last:
        if key not in first:
            continue
        for n in last[key]:
            # Everything in n except its final word; m supplies that word.
            initial = " ".join(n.split(" ")[:-1])
            for m in first[key]:
                if m != n:
                    yield n, m, initial + " " + m


def compound_titles(titles):
    """Find pairs of titles that chain together on a shared word.

    Titles consisting of a single space-separated word are ignored.  Every
    other title is indexed by its lower-cased first word and by its
    lower-cased last word, provided that word is made up solely of three or
    more ASCII letters.  A title ``n`` whose last word equals the first word
    of a different title ``m`` produces the compound "n without its last
    word" + " " + m.

    Args:
        titles: iterable of title strings.  It is consumed completely before
            this function returns.

    Returns:
        An iterator of ``(n, m, combined)`` tuples.
    """
    first = {}
    last = {}
    for title in titles:
        words = title.split(" ")
        if len(words) < 2:  # skip short names
            continue
        key = _keyword(words[0])
        if key is not None:
            first.setdefault(key, []).append(title)
        key = _keyword(words[-1])
        if key is not None:
            last.setdefault(key, []).append(title)
    return _join_titles(first, last)


def _select_source(args):
    """Return the title generator chosen by ``args``, or print usage and exit with -1."""
    if len(args) < 2:
        print("Which source to use? imdb or wikipedia")
        exit(-1)
    source = args[1]
    if source == "imdb":
        if len(args) > 2:
            print("imdb accepts no arguments")
            exit(-1)
        return imdb()
    if source == "wikipedia":
        if len(args) != 3:
            print("wikipedia needs another argument - a category to use")
            exit(-1)
        return wikipedia(args[2])
    print("'%s' isn't wikipedia or imdb" % source)
    exit(-1)


def main(args):
    """Run the generator: read titles from the chosen source and write ``out.txt``.

    Args:
        args: argument vector in ``sys.argv`` layout - program name, then
            ``imdb`` or ``wikipedia <category>``.

    Raises:
        SystemExit: with code -1 when the arguments are not understood.
    """
    # Index every title before touching out.txt, so a failing source does
    # not leave a truncated output file behind.
    compounds = compound_titles(_select_source(args))
    with codecs.open("out.txt", encoding="utf-8", mode="wb") as out:
        for n, m, combined in compounds:
            out.write("\"%s\", \"%s\" => \"%s\"" % (n, m, combined) + "\n")


if __name__ == "__main__":
    main(argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment