Skip to content

Instantly share code, notes, and snippets.

@aliostad
Created June 13, 2017 08:14
Show Gist options
  • Save aliostad/3f91e55376c53489608f4dc8050ee71e to your computer and use it in GitHub Desktop.
Save aliostad/3f91e55376c53489608f4dc8050ee71e to your computer and use it in GitHub Desktop.
A simple script to find Amazon Prime (UK) films and their Rotten Tomatoes scores
from bs4 import BeautifulSoup
import codecs
import requests
def get_film_names(doc):
    """Extract film entries from the HTML of one listing page.

    Every ``h2`` matched by ``ul li .s-item-container h2`` yields one entry.
    Tabs inside the heading text are replaced by spaces because a tab is
    used as the separator between the name and the year: when the heading's
    grandparent has a direct ``span.a-color-secondary`` child, that span's
    text is appended to the name after a single tab.

    Returns a list of strings, each either ``name`` or ``name<TAB>year``.
    """
    soup = BeautifulSoup(doc, 'html.parser')
    entries = []
    for heading in soup.select('ul li .s-item-container h2'):
        entry = heading.text.replace('\t', ' ')
        year_spans = heading.parent.parent.select('> span.a-color-secondary')
        if year_spans:
            # Only the first matching span is used as the year.
            entry = '\t'.join((entry, year_spans[0].text))
        entries.append(entry)
    return entries
def get_rotten_tomatoes_scorex(filmName):
    """Fetch the Rotten Tomatoes page for ``filmName`` and return its score.

    ``filmName`` is appended as-is to ``https://www.rottentomatoes.com/m/``,
    so it must already be in the form the site uses in its URLs.

    Returns the text of the first element matching
    ``#all-critics-numbers span.meter-value span``, or ``None`` when the
    fetched page contains no such element.
    """
    response = requests.get(
        'https://www.rottentomatoes.com/m/' + filmName,
        headers={'User-Agent': 'Mozilla/5.0 (Mobile; rv:26.0) Gecko/26.0 Firefox/26.'},
    )
    page = BeautifulSoup(response.text, 'html.parser')
    matches = page.select('#all-critics-numbers span.meter-value span')
    return matches[0].text if matches else None
def get_rotten_tomatoes_score(filmName, filmYear):
    """Look up a film's score, retrying with the year appended to the name.

    The name is turned into a URL fragment by replacing spaces with
    underscores and dropping colons and apostrophes.  That fragment is tried
    first; only if it yields no score is ``<fragment>_<filmYear>`` tried.

    Returns whatever ``get_rotten_tomatoes_scorex`` returns for the first
    lookup that succeeds, or the result of the second lookup (possibly
    ``None``) otherwise.
    """
    slug = filmName.replace(' ', '_').replace(':', '').replace("'", '')
    first_try = get_rotten_tomatoes_scorex(slug)
    if first_try is not None:
        return first_try
    # Fall back to the year-qualified page name.
    return get_rotten_tomatoes_scorex(slug + '_' + filmYear)
def iterate(frompg=1, pgcount=1000):
    """Scrape Amazon UK search-result pages and save the film entries found.

    Pages ``frompg`` up to, but not including, ``pgcount`` are fetched in
    order (the bounds are passed straight to ``range``).  Every entry
    returned by ``get_film_names`` for a page is written on its own line to
    ``amaz-films.txt`` (UTF-8) in the current directory; the file is
    overwritten on each call.

    The function returns normally instead of calling ``exit(0)``, so code
    that runs after it (such as the score lookup) is actually reached, and
    the output file is closed even if a request raises.
    """
    templ = 'https://www.amazon.co.uk/s/ref=sr_pg_{}?fst=as%3Aoff&rh=n%3A3010085031%2Cn%3A%213010086031%2Cn%3A3046737031%2Cp_85%3A3282143031&bbn=3046737031&ie=UTF8&qid=1497108228&page={}'
    headers = {'User-Agent': 'Mozilla/5.0 (Mobile; rv:26.0) Gecko/26.0 Firefox/26.'}
    with codecs.open('amaz-films.txt', mode='w', encoding='utf-8') as f:
        for pg in range(frompg, pgcount):
            # The page number appears twice in the URL: in the path and in
            # the ``page`` query parameter.
            url = templ.format(pg, pg)
            print(url)
            r = requests.get(url, headers=headers)
            for film in get_film_names(r.text):
                f.write(film + '\n')
def _score_sort_key(score):
    """Return a numeric sort key for a scraped score such as '95' or '95%'.

    Scores that cannot be parsed as a number sort below every numeric score.
    """
    try:
        return float(score.strip().rstrip('%'))
    except ValueError:
        return float('-inf')


def find_score(fileName='amaz-films.txt'):
    """Look up a score for every film listed in ``fileName`` and rank them.

    Each non-blank line of the UTF-8 input file is either ``name`` or
    ``name<TAB>year``; when no year is present ``'2010'`` is used.  Films
    for which ``get_rotten_tomatoes_score`` returns a score are collected
    (keyed by film name) and then written, highest score first, as
    ``score - name`` lines to ``fileName + '.score'`` and echoed to stdout.

    A failure while looking up one film is printed and that film is
    skipped, so a single bad line does not abort the run.
    """
    films = {}
    with codecs.open(fileName, mode='r', encoding='utf-8') as src, \
            codecs.open(fileName + '.score', mode='w', encoding='utf-8') as out:
        for line in src:
            line = line.rstrip('\r\n')
            if not line:
                continue
            # Split on the first tab only; anything after it is the year.
            filmName, sep, filmYear = line.partition('\t')
            if not sep:
                filmYear = '2010'
            try:
                score = get_rotten_tomatoes_score(filmName, filmYear)
                if score is not None:
                    films[filmName] = score
                    # Unicode template so non-ASCII titles also format
                    # cleanly on Python 2.
                    print(u'{} => {}'.format(filmName, score))
            except Exception as e:
                # Deliberate best-effort: report and move on to the next film.
                print(e)
        # Scores are scraped as text, so rank them numerically rather than
        # lexicographically ('100' must outrank '95').
        ranked = sorted(films.items(),
                        key=lambda item: _score_sort_key(item[1]),
                        reverse=True)
        for name, score in ranked:
            txt = u'{} - {}'.format(score, name)
            out.write(txt + '\n')
            print(txt)
# Step 1: collect film names from the Amazon listing pages and store them in a file.
iterate(frompg=1, pgcount=400)
# Step 2: look up a score for each film name loaded from that file.
find_score()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment