Skip to content

Instantly share code, notes, and snippets.

@palfrey
Created February 13, 2011 23:46
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save palfrey/825310 to your computer and use it in GitHub Desktop.
Save palfrey/825310 to your computer and use it in GitHub Desktop.
#compoundMovies generator
import gzip
from re import compile
from os.path import exists
from sys import exit, argv
from xml.dom.minidom import parseString
import codecs
try:
    import urlgrab
except ImportError:
    # urlgrab is not in the standard library; wikipedia() uses urlgrab.Cache()
    # to fetch pages. Tell the user where to get it and stop.
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("run 'git clone git://github.com/palfrey/urlgrab.git' and re-run this script")
    exit(-1)
def imdb():
    """Yield movie titles from ``movies.list.gz`` with year markers removed.

    Reads the gzip-compressed ``movies.list.gz`` in the current directory.
    Everything up to and including the first line containing ``=======`` is
    treated as a header and skipped. Each following line is split on tabs;
    lines with fewer than two non-empty fields are ignored. The first field
    is the title, and everything from the first parenthesised year marker
    (e.g. ``(1979)`` or ``(1990/I)``) onward is cut off before it is yielded.

    Titles are yielded as text (decoded from Latin-1), not raw bytes.

    If the file cannot be opened, a hint is printed and the process exits
    with status -1. Because this is a generator, that happens on the first
    iteration, not at call time.
    """
    try:
        items = gzip.open("movies.list.gz")
    except IOError:
        print("Please goto http://www.imdb.com/interfaces#plain, goto one of the sites and download a copy of movies.list.gz into this directory")
        exit(-1)
    # Raw string: "\(" and "\d" are regex escapes, not Python string escapes.
    year_pattern = compile(r"(.*?)(\([\dIV/]+\))")
    try:
        in_list = False
        for raw in items:
            # gzip yields bytes. Decode once here so callers get text they can
            # write through a UTF-8 encoder; Latin-1 maps every byte, so this
            # can never raise. NOTE(review): IMDb's plain-text lists are
            # believed to be ISO-8859-1 encoded - confirm against a current dump.
            line = raw.decode("latin-1")
            if not in_list:
                # Still in the preamble: look for the "=====" separator and
                # skip the separator line itself.
                if "=======" in line:
                    in_list = True
                continue
            bits = [x.strip() for x in line.split("\t") if x.strip()]
            if len(bits) < 2:
                continue
            movie = bits[0]
            # Strip year markers until none remain.
            while True:
                year = year_pattern.search(movie)
                if year is None:
                    break
                movie = year.group(1).strip()
            yield movie
    finally:
        # Runs when the generator is exhausted, closed, or garbage-collected.
        items.close()
def wikipedia(category):
    """Yield page titles from a Wikipedia category, without film qualifiers.

    Queries the MediaWiki ``categorymembers`` API (XML format) for
    ``Category:<category>`` through a ``urlgrab.Cache`` and keeps requesting
    pages for as long as the response contains a ``categorymembers`` element
    carrying a ``cmstart`` attribute. Each URL is printed before it is fetched.

    Titles starting with ``User:`` are skipped. For other titles, everything
    from the first `` (film)`` or `` (<digits> film)`` onward is cut off;
    titles without such a qualifier are yielded unchanged.

    category -- category name without the ``Category:`` prefix.
    """
    # NOTE(review): category is inserted into the URL as-is (no escaping);
    # names containing spaces or "&" presumably need to be pre-encoded by the
    # caller - confirm before passing arbitrary input.
    baseurl = "http://en.wikipedia.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:%s&cmsort=timestamp&cmdir=desc&format=xml&cmlimit=max" % category
    url = baseurl
    cache = urlgrab.Cache()
    # Raw string: "\(" and "\d" are regex escapes, not Python string escapes.
    film_suffix = compile(r"(.*?)( \((?:\d+ )?film\))")
    while True:
        print(url)
        data = cache.get(url).read()
        doc = parseString(data)
        for element in doc.getElementsByTagName("cm"):
            title = element.getAttribute("title")
            if title.startswith("User:"):
                continue
            found = film_suffix.search(title)
            yield title if found is None else found.group(1)
        # Look for a continuation marker; request the next page if present.
        for element in doc.getElementsByTagName("categorymembers"):
            if element.hasAttribute("cmstart"):
                url = baseurl + "&cmstart=" + element.getAttribute("cmstart")
                break
        else:
            # No continuation marker found: this was the last page.
            break
# Matches a leading run of three or more ASCII letters. A token only counts
# as a "word" when that run is the whole token (see is_plain_word).
isword = compile(r"([a-zA-Z]{3,})")


def is_plain_word(token):
    """Return True if *token* consists solely of three or more ASCII letters."""
    found = isword.match(token)
    return found is not None and found.group(1) == token


def index_titles(titles):
    """Group multi-word titles by their first and by their last word.

    Returns a ``(first, last)`` pair of dicts. ``first`` maps a lower-cased
    first word to the titles starting with it; ``last`` maps a lower-cased
    last word to the titles ending with it. Titles are split on single
    spaces; titles with fewer than two parts are skipped, and a first/last
    part is only indexed when is_plain_word() accepts it.
    """
    first = {}
    last = {}
    for title in titles:
        bits = title.split(" ")
        if len(bits) < 2:  # skip short names
            continue
        if is_plain_word(bits[0]):
            first.setdefault(bits[0].lower(), []).append(title)
        if is_plain_word(bits[-1]):
            last.setdefault(bits[-1].lower(), []).append(title)
    return first, last


def compound_lines(first, last):
    """Yield one output line per pair of titles that chain on a shared word.

    For every word that ends one title ``n`` and starts a different title
    ``m``, yields ``"n", "m" => "<n without its last word> m"``.
    """
    for key in last:
        if key not in first:
            continue
        for n in last[key]:
            initial = " ".join(n.split(" ")[:-1])
            for m in first[key]:
                if m != n:  # a title is never chained with itself
                    yield "\"%s\", \"%s\" => \"%s\"" % (n, m, initial + " " + m)


def choose_source(args):
    """Return the title generator selected by the command-line list *args*.

    ``args[1]`` must be ``imdb`` (no further arguments) or ``wikipedia``
    (exactly one further argument, the category). On any usage error a
    message is printed and the process exits with status -1.
    """
    if len(args) == 1:
        print("Which source to use? imdb or wikipedia")
        exit(-1)
    source = args[1]
    if source == "imdb":
        if len(args) > 2:
            print("imdb accepts no arguments")
            exit(-1)
        return imdb()
    if source == "wikipedia":
        if len(args) != 3:
            print("wikipedia needs another argument - a category to use")
            exit(-1)
        return wikipedia(args[2])
    print("'%s' isn't wikipedia or imdb" % source)
    exit(-1)


def main():
    """Read titles from the chosen source and write compound titles to out.txt."""
    generate = choose_source(argv)
    first, last = index_titles(generate)
    # The with-block closes out.txt even if writing fails part-way through.
    with codecs.open("out.txt", encoding="utf-8", mode="wb") as out:
        for line in compound_lines(first, last):
            out.write(line + "\n")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment