# palfrey/compoundMovies.py

## compoundMovies.py
import gzip
from re import compile
from os.path import exists
from sys import exit, argv
from xml.dom.minidom import parseString
import codecs
try:
	import urlgrab
except ImportError:
	print "run 'git clone git://github.com/palfrey/urlgrab.git' and re-run this script"
	exit(-1)

def imdb():
	"""Yield movie titles read from ``movies.list.gz`` in the current directory.

	Everything up to and including the first line containing ``=======`` is
	skipped.  After that each line is split on tabs; lines with fewer than
	two non-empty fields are ignored and the first field is used as the
	title.  The title is cut at the first parenthesised group made up only
	of digits, ``I``, ``V`` and ``/`` (e.g. ``(1979)``, ``(2001/I)``,
	``(V)``), so a field that *starts* with such a group yields ``""``.

	Titles are yielded as decoded text rather than raw bytes, so non-ASCII
	titles can be written to a ``codecs`` stream without an implicit ASCII
	decode failing.

	If the archive cannot be opened a hint is printed and the process exits
	with status -1.  Because this is a generator, that happens on the first
	iteration, not when ``imdb()`` is called.
	"""
	try:
		items = gzip.open("movies.list.gz")
	except IOError:
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print("Please goto http://www.imdb.com/interfaces#plain, goto one of the sites and download a copy of movies.list.gz into this directory")
		exit(-1)

	# Group 1 is the (lazy) text before the first year-like "(...)" group.
	yearPattern = compile(r"(.*?)(\([\dIV/]+\))")

	try:
		inlist = False
		for raw in items:
			# latin-1 maps every byte to a code point, so this cannot fail.
			# NOTE(review): IMDb's plain-text lists are believed to be
			# ISO-8859-1 encoded - confirm against the download docs.
			line = raw.decode("latin-1")

			if not inlist:
				# Still in the preamble; the "=======" rule ends it.
				if "=======" in line:
					inlist = True
				continue

			bits = [x.strip() for x in line.split("\t") if x.strip()]
			if len(bits) < 2:
				continue

			movie = bits[0]
			# One search is enough: the lazy prefix stops at the first
			# match, so the captured prefix can never contain another one.
			year = yearPattern.search(movie)
			if year is not None:
				movie = year.group(1).strip()

			yield movie
	finally:
		# Runs on exhaustion, on error, and when the generator is discarded.
		items.close()

def wikipedia(category):
	"""Yield the titles of the pages in an English Wikipedia category.

	The category members are fetched through the MediaWiki ``api.php``
	endpoint (XML format) using ``urlgrab.Cache``; every URL requested is
	printed first.  Titles starting with ``User:`` are skipped, and a
	trailing `` (film)`` / `` (<digits> film)`` disambiguator is removed
	from the yielded title.

	category -- category name without the ``Category:`` prefix.  It is
	            percent-encoded before being placed in the URL; ``/`` and
	            ``%`` are left alone so already-encoded names still work.

	Result pages are followed until the response carries no continuation
	marker.
	"""
	# quote() lives in different modules on Python 2 and Python 3.
	try:
		from urllib import quote
	except ImportError:
		from urllib.parse import quote

	def escape(value):
		# quote() on Python 2 cannot take non-ASCII unicode, so hand it
		# UTF-8 bytes (which Python 3's quote() accepts as well).
		if not isinstance(value, bytes):
			value = value.encode("utf-8")
		return quote(value, safe="/%")

	baseurl = "http://en.wikipedia.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:%s&cmsort=timestamp&cmdir=desc&format=xml&cmlimit=max" % escape(category)

	url = baseurl

	cache = urlgrab.Cache()

	# Group 1 is the title without its " (film)" / " (1999 film)" suffix.
	yearPattern = compile(r"(.*?)( \((?:\d+ )?film\))")

	while True:
		print(url)
		data = cache.get(url).read()
		doc = parseString(data)
		for element in doc.getElementsByTagName("cm"):
			title = element.getAttribute("title")
			if title.startswith("User:"):
				continue
			year = yearPattern.search(title)
			if year is not None:
				yield year.group(1)
			else:
				yield title

		# Find the continuation marker, if any.  Legacy responses put
		# cmstart (or cmcontinue) on a <categorymembers> element; the
		# current API format uses <continue cmcontinue="..." continue="..."/>.
		next_url = None
		markers = list(doc.getElementsByTagName("categorymembers")) + list(doc.getElementsByTagName("continue"))
		for element in markers:
			if element.hasAttribute("cmstart"):
				next_url = baseurl + "&cmstart=" + element.getAttribute("cmstart")
				break
			if element.hasAttribute("cmcontinue"):
				next_url = baseurl + "&cmcontinue=" + escape(element.getAttribute("cmcontinue"))
				if element.hasAttribute("continue"):
					# The API expects this value to be echoed back as-is.
					next_url += "&continue=" + escape(element.getAttribute("continue"))
				break

		if next_url is None:
			# No marker: this was the last page.
			break
		url = next_url


# Command-line driver: index titles by their first and last word, then write
# every pair where one title's last word equals another title's first word
# (case-insensitively) to out.txt, together with the merged "compound" title.

# A usable word is a run of at least three ASCII letters.
isword = compile("([a-zA-Z]{3,})")

first = {}	# lower-cased first word -> titles starting with that word
last = {}	# lower-cased last word -> titles ending with that word

# Single-argument print(...) behaves the same on Python 2 and 3.
if len(argv)==1:
	print("Which source to use? imdb or wikipedia")
	exit(-1)

if argv[1] == "imdb":
	if len(argv)>2:
		print("imdb accepts no arguments")
		exit(-1)
	generate = imdb()
elif argv[1] == "wikipedia":
	if len(argv)!=3:
		print("wikipedia needs another argument - a category to use")
		exit(-1)
	generate = wikipedia(argv[2])
else:
	print("'%s' isn't wikipedia or imdb"%argv[1])
	exit(-1)

for m in generate:
	bits = m.split(" ")

	if len(bits)<2: # skip short names
		continue

	# Same rule for both ends: the token must consist entirely of the
	# matched word, i.e. nothing but three or more letters.
	for token, index in ((bits[0], first), (bits[-1], last)):
		word = isword.match(token)
		if word and word.group(1) == token:
			index.setdefault(token.lower(), []).append(m)

# The with-block closes out.txt even when a write fails part-way through.
with codecs.open("out.txt",encoding="utf-8",mode="wb") as out:
	for key in last:
		if key not in first:
			continue
		for n in last[key]:
			# n without its final word; m supplies that word and the rest.
			initial = " ".join(n.split(" ")[:-1])
			for m in first[key]:
				if m!=n:
					out.write("\"%s\", \"%s\" => \"%s\"\n"%(n, m, initial + " " + m))
	import gzip
	from re import compile
	from os.path import exists
	from sys import exit, argv
	from xml.dom.minidom import parseString
	import codecs
	try:
	import urlgrab
	except ImportError:
	print "run 'git clone git://github.com/palfrey/urlgrab.git' and re-run this script"
	exit(-1)

	def imdb():
	# NOTE(review): this is an indentation-flattened copy of the imdb()
	# generator defined near the top of this file. Every line sits at a single
	# tab, so the nesting cannot be read from this text and Python will not
	# parse it - confirm whether this duplicate should simply be removed.
	# Statement by statement: open movies.list.gz (printing a download hint
	# and exiting with -1 on IOError), skip input until a line containing
	# "=======" has been seen, then take the first non-empty tab-separated
	# field of each line that has at least two such fields.
	try:
	items = gzip.open("movies.list.gz")
	except IOError:
	print "Please goto http://www.imdb.com/interfaces#plain, goto one of the sites and download a copy of movies.list.gz into this directory"
	exit(-1)

	# Group 1 is the text before the first "(...)" group made of digits, I, V and /.
	yearPattern = compile("(.*?)(\([\dIV/]+\))")

	inlist = False
	for r in items:
	if not inlist:
	if r.find("=======")!=-1:
	inlist = True
	continue

	bits = [x.strip() for x in r.split("\t") if len(x.strip())>0]
	if len(bits)<2:
	continue
	movie = bits[0]
	# Repeatedly replace the title with the text preceding the matched group.
	while True:
	year = yearPattern.search(movie)
	if year == None:
	break
	movie = year.groups()[0].strip()

	yield movie

	def wikipedia(category):
	# NOTE(review): this is an indentation-flattened copy of the
	# wikipedia(category) generator defined earlier in this file. Every line
	# sits at a single tab, so the nesting cannot be read from this text and
	# Python will not parse it - confirm whether this duplicate should be removed.
	# Statement by statement: build a categorymembers API URL for the given
	# category, fetch it through urlgrab.Cache, parse the XML, and yield the
	# "title" attribute of each <cm> element.
	baseurl = "http://en.wikipedia.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:%s&cmsort=timestamp&cmdir=desc&format=xml&cmlimit=max"%category

	url = baseurl

	cache = urlgrab.Cache()

	# Group 1 is the title without a trailing " (film)" / " (<digits> film)".
	yearPattern = compile("(.*?)( \((?:\d+ )?film\))")

	while True:
	print url
	data = cache.get(url).read()
	doc = parseString(data)
	for element in doc.getElementsByTagName("cm"):
	title = element.getAttribute("title")
	# Titles in the User: namespace are skipped.
	if title.startswith("User:"):
	continue
	year = yearPattern.search(title)
	if year!=None:
	yield year.groups()[0]
	else:
	yield title

	# A "cmstart" attribute on a <categorymembers> element gives the value
	# appended to the base URL for the next request.
	for element in doc.getElementsByTagName("categorymembers"):
	if element.hasAttribute("cmstart"):
	start = element.getAttribute("cmstart")
	url = baseurl + "&cmstart="+start
	break
	else: # if we don't otherwise break
	break



	isword = compile("([a-zA-Z]{3,})")
	# NOTE(review): from here to the end of the file is an indentation-flattened
	# copy of the command-line driver that already appears earlier in this file.
	# Every line sits at a single tab, so the nesting cannot be read from this
	# text and Python will not parse it - confirm whether it should be removed.

	# Titles keyed by lower-cased first word / last word respectively.
	first = {}
	last = {}

	# argv[1] picks the title source; "wikipedia" takes one extra argument.
	if len(argv)==1:
	print "Which source to use? imdb or wikipedia"
	exit(-1)

	if argv[1] == "imdb":
	if len(argv)>2:
	print "imdb accepts no arguments"
	exit(-1)
	generate = imdb()
	elif argv[1] == "wikipedia":
	if len(argv)!=3:
	print "wikipedia needs another argument - a category to use"
	exit(-1)
	generate = wikipedia(argv[2])
	else:
	print "'%s' isn't wikipedia or imdb"%argv[1]
	exit(-1)

	for m in generate:
	bits = m.split(" ")

	if len(bits)<2: # skip short names:
	continue

	# A word is indexed only when the regex match covers the entire token.
	word = isword.match(bits[0])
	if word and word.groups()[0] == bits[0]:
	k = bits[0].lower()
	if k not in first:
	first[k] = []
	first[k].append(m)
	word = isword.match(bits[-1])
	if word and word.groups()[0] == bits[-1]:
	k = bits[-1].lower()
	if k not in last:
	last[k] = []
	last[k.lower()].append(m)

	#if len(first)>100:
	# break

	#print sorted(last.keys())
	#print sorted(first.keys())

	out = codecs.open("out.txt",encoding="utf-8",mode="wb")

	# For each word that both ends one title (n) and starts another (m), write
	# the two titles and n-without-its-last-word followed by m.
	for key in last:
	if key in first:
	for n in last[key]:
	initial = " ".join(n.split(" ")[:-1])
	for m in first[key]:
	if m!=n:
	print >>out, "\"%s\", \"%s\" => \"%s\""%(n, m, initial + " " + m)

	out.close()