# t-redactyl/web-scraping.py

## web-scraping.py
import lxml.html
from lxml.cssselect import CSSSelector
import requests

def get_title(node):
    '''
    Return the raw title text for one entry of the list at
    http://www.timeout.com/london/film/the-50-best-christmas-movies

    The title sits in the first h3 under div.feature-item__text. Some entries
    wrap it in a link (h3 a): when a link is present its text is returned,
    otherwise the text of the h3 itself.
    '''
    heading = node.cssselect('div.feature-item__text h3')[0]
    links = heading.cssselect('a')
    # Prefer the link text when the heading contains one.
    title_elem = links[0] if len(links) != 0 else heading
    return title_elem.text_content()

# Standard-library helpers used by the script below; imported here so this
# section stays self-contained.
import io
import re

# Every title on the page is expected to end in a " (YYYY)" year suffix; a
# moved article is re-inserted just in front of that suffix.
YEAR_SUFFIX_LEN = len(" (2005)")

# A leading "The " or "A ". The trailing space is required so that titles
# which merely start with those letters (e.g. "Arthur Christmas") are left
# untouched.
_LEADING_ARTICLE = re.compile(r'(The|A) ')


def _move_leading_article(title):
    '''
    Move a leading "The " or "A " behind the name, in front of the year:
    "The Snowman (1982)" -> "Snowman, The (1982)".

    Titles without such a leading article are returned unchanged. At most
    one article is moved per title.
    '''
    match = _LEADING_ARTICLE.match(title)
    if match is None:
        return title
    article = match.group(1)
    rest = title[match.end():]
    return rest[:-YEAR_SUFFIX_LEN] + ", " + article + rest[-YEAR_SUFFIX_LEN:]


# Get data and transform to text
r = requests.get("http://www.timeout.com/london/film/the-50-best-christmas-movies")
# Fail loudly on a 4xx/5xx response instead of parsing an error page.
r.raise_for_status()
tree = lxml.html.fromstring(r.text)

# Create selector and apply to text
items_selector = CSSSelector('article.feature-item')
all_items = items_selector(tree)

# Pull titles from text using 'get_title' function.
h3_titles = [get_title(item) for item in all_items[0:50]]

# Strip newline and whitespace from titles
titles = [t.replace('\n', '').strip() for t in h3_titles]

# Replace typographic apostrophes (U+2019) with plain ASCII ones. The titles
# are kept as text here and only encoded once, when the file is written;
# mixing encoded bytes with str arguments fails on Python 3.
titles = [t.replace(u'\u2019', u"'") for t in titles]

# Replace titles in the form "The [title]" to "[title], The" and
# "A [title]" to "[title], A"
titles = [_move_leading_article(t) for t in titles]

# Change "Joyeux Noel" (spelled with an e-diaeresis on the page) to just
# "Joyeux" due to special character matching issues. The replacement is a
# no-op for every other title, so it is applied to the whole list rather than
# to a hard-coded position that may not exist.
titles = [t.replace(u'Joyeux No\xebl (2005)', u'Joyeux') for t in titles]

# Export to text file (UTF-8, one title per line); the context manager closes
# the file even if the write fails.
with io.open("christmas_movies.txt", "w", encoding="utf-8") as f:
    f.write(u"\n".join(titles))
	import lxml.html
	from lxml.cssselect import CSSSelector
	import requests

	def get_title(node):
	    '''
	    Return the raw title text for one entry of the list at
	    http://www.timeout.com/london/film/the-50-best-christmas-movies

	    The title sits in the first h3 under div.feature-item__text. Some
	    entries wrap it in a link (h3 a): when a link is present its text is
	    returned, otherwise the text of the h3 itself.
	    '''
	    h3_elem = node.cssselect('div.feature-item__text h3')[0]
	    anchor_elem = h3_elem.cssselect('a')
	    # No link inside the heading: the heading text is the title.
	    if not anchor_elem:
	        return h3_elem.text_content()
	    return anchor_elem[0].text_content()

	# Standard-library helpers used by the script below; imported here so this
	# section stays self-contained.
	import io
	import re

	# Every title on the page is expected to end in a " (YYYY)" year suffix; a
	# moved article is re-inserted just in front of that suffix.
	YEAR_SUFFIX_LEN = len(" (2005)")

	# A leading "The " or "A ". The trailing space is required so that titles
	# which merely start with those letters (e.g. "Arthur Christmas") are left
	# untouched.
	_LEADING_ARTICLE = re.compile(r'(The|A) ')


	def _move_leading_article(title):
	    '''
	    Move a leading "The " or "A " behind the name, in front of the year:
	    "The Snowman (1982)" -> "Snowman, The (1982)".

	    Titles without such a leading article are returned unchanged. At most
	    one article is moved per title.
	    '''
	    match = _LEADING_ARTICLE.match(title)
	    if match is None:
	        return title
	    article = match.group(1)
	    rest = title[match.end():]
	    return rest[:-YEAR_SUFFIX_LEN] + ", " + article + rest[-YEAR_SUFFIX_LEN:]


	# Get data and transform to text
	r = requests.get("http://www.timeout.com/london/film/the-50-best-christmas-movies")
	# Fail loudly on a 4xx/5xx response instead of parsing an error page.
	r.raise_for_status()
	tree = lxml.html.fromstring(r.text)

	# Create selector and apply to text
	items_selector = CSSSelector('article.feature-item')
	all_items = items_selector(tree)

	# Pull titles from text using 'get_title' function.
	h3_titles = [get_title(item) for item in all_items[0:50]]

	# Strip newline and whitespace from titles
	titles = [t.replace('\n', '').strip() for t in h3_titles]

	# Replace typographic apostrophes (U+2019) with plain ASCII ones. The titles
	# are kept as text here and only encoded once, when the file is written;
	# mixing encoded bytes with str arguments fails on Python 3.
	titles = [t.replace(u'\u2019', u"'") for t in titles]

	# Replace titles in the form "The [title]" to "[title], The" and
	# "A [title]" to "[title], A"
	titles = [_move_leading_article(t) for t in titles]

	# Change "Joyeux Noel" (spelled with an e-diaeresis on the page) to just
	# "Joyeux" due to special character matching issues. The replacement is a
	# no-op for every other title, so it is applied to the whole list rather
	# than to a hard-coded position that may not exist.
	titles = [t.replace(u'Joyeux No\xebl (2005)', u'Joyeux') for t in titles]

	# Export to text file (UTF-8, one title per line); the context manager
	# closes the file even if the write fails.
	with io.open("christmas_movies.txt", "w", encoding="utf-8") as f:
	    f.write(u"\n".join(titles))