Created
December 18, 2015 01:02
-
-
Save t-redactyl/13cfc08629186e11fb12 to your computer and use it in GitHub Desktop.
Web scraping code for the blog post: Finding the highest rated Christmas movies in MovieLens 10M (23/12/2015)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io
import re

import lxml.html
import requests
from lxml.cssselect import CSSSelector
def get_title(node):
    """Return the movie title text found inside a single list entry.

    Written for the entries on
    http://www.timeout.com/london/film/the-50-best-christmas-movies, where
    some titles are plain ``h3`` text and others are wrapped in an ``a``
    element inside the ``h3``.

    Args:
        node: An element exposing ``cssselect``; it is expected to contain a
            ``div.feature-item__text h3`` descendant.

    Returns:
        The text content of the first link inside the heading when there is
        one, otherwise the text content of the heading itself.

    Raises:
        IndexError: If ``node`` contains no matching ``h3`` heading.
    """
    h3_elem = node.cssselect('div.feature-item__text h3')[0]
    anchor_elem = h3_elem.cssselect('a')
    # Prefer the link text when the heading wraps its title in an anchor.
    if anchor_elem:
        return anchor_elem[0].text_content()
    return h3_elem.text_content()
# Matches a title that starts with the whole word "The" or "A" followed by a
# space, optionally ending in a " (YYYY)" release year. The year group has an
# empty alternative so it always takes part in the match and can be referenced
# safely in the replacement.
_LEADING_ARTICLE = re.compile(r'^(The|A) (.*?)( \(\d{4}\)|)$')


def _move_leading_article(title):
    """Return *title* with a leading "The"/"A" moved behind the name.

    "The Grinch (2000)" becomes "Grinch, The (2000)". Titles that do not
    start with the whole word "The" or "A" (for example "Arthur Christmas
    (2011)") are returned unchanged.
    """
    return _LEADING_ARTICLE.sub(r'\2, \1\3', title)


# Get data and transform to text
r = requests.get("http://www.timeout.com/london/film/the-50-best-christmas-movies")
# Fail loudly on an HTTP error instead of scraping an error page.
r.raise_for_status()
tree = lxml.html.fromstring(r.text)

# Create selector and apply to text
items_selector = CSSSelector('article.feature-item')
all_items = items_selector(tree)

# Pull titles from the first 50 entries using the 'get_title' function.
h3_titles = [get_title(item) for item in all_items[0:50]]

# Strip newlines and surrounding whitespace from titles
titles = [t.replace('\n', '').strip() for t in h3_titles]

# Replace typographic apostrophes (U+2019) with plain ASCII apostrophes.
# The titles stay as text here; encoding to UTF-8 happens once, on write.
titles = [t.replace(u'\u2019', u"'") for t in titles]

# Rewrite titles of the form "The [title]" / "A [title]" as
# "[title], The" / "[title], A", keeping any trailing " (YYYY)" year last.
titles = [_move_leading_article(t) for t in titles]

# Change "Joyeux No(e-diaeresis)l (2005)" to just "Joyeux" due to special
# character matching issues. Applied to every title rather than one fixed
# list position, so it still works if the page order changes.
titles = [t.replace(u'Joyeux No\xebl (2005)', u'Joyeux') for t in titles]

# Export to a UTF-8 text file, one title per line
with io.open("christmas_movies.txt", "w", encoding="utf-8") as f:
    f.write(u"\n".join(titles))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment