Skip to content

Instantly share code, notes, and snippets.

@t-redactyl
Created December 18, 2015 01:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save t-redactyl/13cfc08629186e11fb12 to your computer and use it in GitHub Desktop.
Save t-redactyl/13cfc08629186e11fb12 to your computer and use it in GitHub Desktop.
Web scraping code for the blog post: Finding the highest rated Christmas movies in MovieLens 10M (23/12/2015)
import lxml.html
from lxml.cssselect import CSSSelector
import requests
def get_title(node):
    '''
    Return the movie title text found inside one list item of
    http://www.timeout.com/london/film/the-50-best-christmas-movies

    Some titles sit directly in the <h3>, others are wrapped in a link
    inside it (h3 a); the link text is preferred when a link is present.

    node: an element supporting ``cssselect``, expected to contain a
          ``div.feature-item__text h3`` heading.

    Returns the raw text content (surrounding whitespace/newlines are NOT
    stripped here).

    Raises IndexError if the node contains no matching <h3>.
    '''
    h3_elem = node.cssselect('div.feature-item__text h3')[0]
    anchor_elem = h3_elem.cssselect('a')
    if not anchor_elem:
        # Plain heading: the title is the heading's own text.
        return h3_elem.text_content()
    # Linked heading: take the text of the first link.
    return anchor_elem[0].text_content()
# Script: download the Time Out "50 best Christmas movies" page, extract the
# movie titles, normalise them ("The X (1999)" -> "X, The (1999)", curly
# apostrophes -> "'") and write them, one per line, to christmas_movies.txt.
import io
import re

# A title that starts with the whole word "The" or "A", optionally ending in
# a " (YYYY)" release-year suffix. Requiring the space after the article
# keeps titles such as "Arthur Christmas" from being treated as "A ...".
_LEADING_ARTICLE = re.compile(r"^(The|A) (.+?)( \(\d{4}\))?$")


def _move_leading_article(title):
    """Rewrite "The X (1999)" / "A X (1999)" as "X, The (1999)" / "X, A (1999)".

    Titles that do not start with one of those articles are returned
    unchanged. When there is no year suffix the article is appended at the end.
    """
    found = _LEADING_ARTICLE.match(title)
    if found is None:
        return title
    article, name, year = found.groups()
    return u"{0}, {1}{2}".format(name, article, year or u"")


# Get data and transform to text
r = requests.get("http://www.timeout.com/london/film/the-50-best-christmas-movies",
                 timeout=30)
# Fail loudly on an HTTP error instead of scraping an error page.
r.raise_for_status()
tree = lxml.html.fromstring(r.text)

# Create selector and apply to text
items_selector = CSSSelector('article.feature-item')
all_items = items_selector(tree)

# Pull titles from text using 'get_title' function.
h3_titles = [get_title(item) for item in all_items[0:50]]

# Strip newline and whitespace from titles
titles = [t.replace('\n', '').strip() for t in h3_titles]

# Replace typographic apostrophes (U+2019) with plain ASCII ones. The titles
# stay as text here and are encoded once, when the file is written.
titles = [t.replace(u'\u2019', u"'") for t in titles]

# Replace titles in the form "The [title]" to "[title], The"
# and titles in the form "A [title]" to "[title], A"
titles = [_move_leading_article(t) for t in titles]

# Change "Joyeux Noël" to just "Joyeux" due to special character matching issues.
# Applied to every title rather than a fixed list position, so it still works
# if the page order changes.
titles = [t.replace(u'Joyeux No\xebl (2005)', u'Joyeux') for t in titles]

# Export to text file (UTF-8, one title per line, no trailing newline)
with io.open("christmas_movies.txt", "w", encoding="utf-8") as f:
    f.write(u"\n".join(titles))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment