Skip to content

Instantly share code, notes, and snippets.

@bllchmbrs
Created February 17, 2016 21:26
Show Gist options
  • Save bllchmbrs/834945335f3deaa3b5d0 to your computer and use it in GitHub Desktop.
Save bllchmbrs/834945335f3deaa3b5d0 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import re
import glob
def get_prod(soup):
    """Collect production-company names from a parsed film page.

    Scans every ``<tr>`` in *soup*. When a row contains a ``<th>`` whose
    text is the label "Production company", the text of each link found
    inside that row's ``<td>`` cells is collected.

    Args:
        soup: Parsed document exposing ``select`` (the script passes a
            ``BeautifulSoup`` object).

    Returns:
        A list of link texts in document order; empty when no row carries
        the label.
    """
    production_companies = []
    for row in soup.select("tr"):
        for th in row.select("th"):
            # The label's two words may be separated by a newline or by
            # spaces depending on the markup, so collapse whitespace runs
            # before comparing instead of requiring exactly one "\n".
            if " ".join(th.text.split()) == "Production company":
                production_companies.extend(
                    comp.text for comp in row.select("td a")
                )
    return production_companies
# Captures a path's base name (everything after the last path separator)
# when the path ends in ".html". The previous character-class pattern could
# not match characters such as "-", "." or ":", so it silently kept only the
# tail of such file names (e.g. "Spider-Man.html" -> "Man").
_HTML_PAGE = re.compile(r"([^/\\]+)\.html$")

# Parse every saved page under data/wiki/movies/ in sorted path order and
# derive, per file, its "/wiki/<name>" id and its production companies.
for name in sorted(glob.glob("data/wiki/movies/*")):
    page = _HTML_PAGE.search(name)
    if page is None:
        # Not an .html file: skip it rather than fail with IndexError.
        continue
    # Hand the raw bytes to BeautifulSoup so it works out the document's
    # encoding itself instead of relying on the platform's default text
    # encoding.
    with open(name, "rb") as f:
        soup = BeautifulSoup(f, 'lxml')
    movie_id = "/wiki/" + page.group(1)
    production_companies = get_prod(soup)
    # NOTE(review): movie_id and production_companies are overwritten on
    # every iteration and are not stored or printed in the visible source;
    # confirm whether a consumer is missing.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment