aliostad/amazon_rotten_tomatoes_score.py

## amazon_rotten_tomatoes_score.py
from bs4 import BeautifulSoup
import codecs
import requests

def get_film_names(doc):
  """Pull film entries out of the HTML of a search-result page.

  doc is the page markup as a string. One entry is produced for every
  element matched by 'ul li .s-item-container h2'. An entry is the
  heading text; when the heading's grandparent has a direct child
  'span.a-color-secondary', that span's text is appended after a tab
  (callers treat it as the film year).

  Returns the entries as a list, in document order.
  """
  soup = BeautifulSoup(doc, 'html.parser')
  entries = []
  for heading in soup.select('ul li .s-item-container h2'):
    # Tabs inside the title become spaces so that a tab can only ever
    # be the separator between the title and the appended span text.
    title = heading.text.replace('\t', ' ')
    container = heading.parent.parent
    secondary = container.select('> span.a-color-secondary')
    if secondary:
      entries.append(title + '\t' + secondary[0].text)
    else:
      entries.append(title)
  return entries

def get_rotten_tomatoes_scorex(filmName, timeout=30):
  """Fetch the critics score shown on one Rotten Tomatoes film page.

  filmName -- page slug; it is appended to
              'https://www.rottentomatoes.com/m/' as-is.
  timeout  -- seconds to wait for the server. Without a timeout a
              stalled connection would block the whole run forever;
              with it, requests raises a Timeout exception instead.

  Returns the text of the first element matching
  '#all-critics-numbers span.meter-value span', or None when the
  returned page contains no such element.
  """
  url = 'https://www.rottentomatoes.com/m/' + filmName
  headers = {'User-Agent': 'Mozilla/5.0 (Mobile; rv:26.0) Gecko/26.0 Firefox/26.'}
  r = requests.get(url, headers=headers, timeout=timeout)
  bs = BeautifulSoup(r.text, 'html.parser')
  spans = bs.select('#all-critics-numbers span.meter-value span')
  return spans[0].text if spans else None

def get_rotten_tomatoes_score(filmName, filmYear):
  """Look up a film's score, retrying with the year appended to the slug.

  The slug is built from filmName by turning spaces into underscores
  and dropping colons and apostrophes. The plain slug is tried first;
  if that lookup returns None, '<slug>_<filmYear>' is tried.

  Returns whatever get_rotten_tomatoes_scorex returned for the last
  lookup performed (None when both lookups found nothing).
  """
  slug = filmName
  for old, new in ((' ', '_'), (':', ''), ("'", '')):
    slug = slug.replace(old, new)
  found = get_rotten_tomatoes_scorex(slug)
  if found is not None:
    return found
  return get_rotten_tomatoes_scorex(slug + '_' + filmYear)

def iterate(frompg=1, pgcount=1000):
  """Scrape Amazon UK search-result pages and save the film names found.

  frompg  -- first page number to request.
  pgcount -- exclusive upper bound: pages frompg .. pgcount - 1 are
             requested (the value is passed straight to range()).

  Every entry returned by get_film_names is written on its own line to
  'amaz-films.txt' (UTF-8); the file is overwritten on each run.

  NOTE: this function used to call exit(0) once the file was written,
  which terminated the whole process (so nothing after the call could
  run) and left a block of unreachable code that wrote to the already
  closed file. It now simply returns when the scrape is done.
  """
  templ = 'https://www.amazon.co.uk/s/ref=sr_pg_{}?fst=as%3Aoff&rh=n%3A3010085031%2Cn%3A%213010086031%2Cn%3A3046737031%2Cp_85%3A3282143031&bbn=3046737031&ie=UTF8&qid=1497108228&page={}'
  headers = {'User-Agent': 'Mozilla/5.0 (Mobile; rv:26.0) Gecko/26.0 Firefox/26.'}
  # 'with' guarantees the file is flushed and closed even when a
  # request or the HTML parsing raises part-way through the scrape.
  with codecs.open('amaz-films.txt', mode='w', encoding='utf-8') as f:
    for pg in range(frompg, pgcount):
      # The page number appears twice in the URL template.
      url = templ.format(pg, pg)
      print(url)
      r = requests.get(url, headers=headers)
      for film in get_film_names(r.text):
        f.write(film + '\n')

def _score_value(text):
  """Return the scraped score text as an int, or -1 if it is not a number."""
  try:
    return int(text.strip().rstrip('%'))
  except ValueError:
    return -1


def find_score(fileName='amaz-films.txt'):
  """Look up a score for every film listed in fileName and rank them.

  fileName -- UTF-8 text file with one film per line, either
              'name<TAB>year' or just 'name' (the year then defaults
              to '2010').

  Films for which get_rotten_tomatoes_score returns a score are
  printed and, once all lines are processed, written to
  fileName + '.score' as '<score> - <name>' lines, highest score
  first. A failure on one line is printed and that line is skipped.
  """
  films = {}
  with codecs.open(filename=fileName, mode='r', encoding='utf-8') as f:
    for line in f:
      try:
        line = line.replace('\n', '')
        if '\t' in line:
          filmName, filmYear = line.split('\t')
        else:
          # Assign the film name, not the fileName parameter: the old
          # code clobbered fileName here and left filmName stale.
          filmName = line
          filmYear = '2010'
        score = get_rotten_tomatoes_score(filmName, filmYear)
        if score is not None:
          # Key by film name so every film keeps its own entry.
          films[filmName] = score
          # Unicode template: a byte-string template fails on
          # non-ASCII titles under Python 2.
          print(u'{} => {}'.format(filmName, score))
      except Exception as e:
        # Best effort: report the problem and carry on with the next film.
        print(e)
  # Scores are scraped text; rank them numerically so that '100'
  # sorts above '95' instead of below it.
  sortedFilms = sorted(films, key=lambda name: _score_value(films[name]), reverse=True)
  with codecs.open(filename=fileName + '.score', mode='w', encoding='utf-8') as fr:
    for film in sortedFilms:
      txt = u'{} - {}'.format(films[film], film)
      print(txt)
      fr.write(txt + '\n')

# Step 1: scrape Amazon result pages 1 to 399 and store the film names in a file.
iterate(frompg=1, pgcount=400)

# Step 2: look up a score for each film name loaded from that file.
find_score()
	from bs4 import BeautifulSoup
	import codecs
	import requests

	def get_film_names(doc):
		"""Pull film entries out of the HTML of a search-result page.

		doc is the page markup as a string. One entry is produced for every
		element matched by 'ul li .s-item-container h2'. An entry is the
		heading text; when the heading's grandparent has a direct child
		'span.a-color-secondary', that span's text is appended after a tab
		(callers treat it as the film year).

		Returns the entries as a list, in document order.
		"""
		res = []
		bs = BeautifulSoup(doc, 'html.parser')
		for a in bs.select('ul li .s-item-container h2'):
			# Tabs inside the title become spaces so that a tab can only
			# ever be the separator between title and appended span text.
			name = a.text.replace('\t', ' ')
			yearTags = a.parent.parent.select('> span.a-color-secondary')
			if yearTags:
				name = name + '\t' + yearTags[0].text
			res.append(name)
		return res

	def get_rotten_tomatoes_scorex(filmName, timeout=30):
		"""Fetch the critics score shown on one Rotten Tomatoes film page.

		filmName -- page slug; it is appended to
		            'https://www.rottentomatoes.com/m/' as-is.
		timeout  -- seconds to wait for the server. Without a timeout a
		            stalled connection would block the whole run forever;
		            with it, requests raises a Timeout exception instead.

		Returns the text of the first element matching
		'#all-critics-numbers span.meter-value span', or None when the
		returned page contains no such element.
		"""
		url = 'https://www.rottentomatoes.com/m/' + filmName
		headers = {'User-Agent': 'Mozilla/5.0 (Mobile; rv:26.0) Gecko/26.0 Firefox/26.'}
		r = requests.get(url, headers=headers, timeout=timeout)
		html = r.text
		bs = BeautifulSoup(html, 'html.parser')
		spans = bs.select('#all-critics-numbers span.meter-value span')
		if spans:
			return spans[0].text
		return None

	def get_rotten_tomatoes_score(filmName, filmYear):
		"""Look up a film's score, retrying with the year appended to the slug.

		The slug is built from filmName by turning spaces into underscores
		and dropping colons and apostrophes. The plain slug is tried first;
		if that lookup returns None, '<slug>_<filmYear>' is tried.

		Returns whatever get_rotten_tomatoes_scorex returned for the last
		lookup performed (None when both lookups found nothing).
		"""
		filmName = filmName.replace(' ', '_').replace(':', '').replace("'", '')
		score = get_rotten_tomatoes_scorex(filmName)
		if score is None:
			score = get_rotten_tomatoes_scorex(filmName + '_' + filmYear)
		return score

	def iterate(frompg=1, pgcount=1000):
		"""Scrape Amazon UK search-result pages and save the film names found.

		frompg  -- first page number to request.
		pgcount -- exclusive upper bound: pages frompg .. pgcount - 1 are
		           requested (the value is passed straight to range()).

		Every entry returned by get_film_names is written on its own line to
		'amaz-films.txt' (UTF-8); the file is overwritten on each run.

		NOTE: this function used to call exit(0) once the file was written,
		which terminated the whole process (so nothing after the call could
		run) and left a block of unreachable code that wrote to the already
		closed file. It now simply returns when the scrape is done.
		"""
		templ = 'https://www.amazon.co.uk/s/ref=sr_pg_{}?fst=as%3Aoff&rh=n%3A3010085031%2Cn%3A%213010086031%2Cn%3A3046737031%2Cp_85%3A3282143031&bbn=3046737031&ie=UTF8&qid=1497108228&page={}'
		headers = {'User-Agent': 'Mozilla/5.0 (Mobile; rv:26.0) Gecko/26.0 Firefox/26.'}
		# 'with' guarantees the file is flushed and closed even when a
		# request or the HTML parsing raises part-way through the scrape.
		with codecs.open('amaz-films.txt', mode='w', encoding='utf-8') as f:
			for pg in range(frompg, pgcount):
				# The page number appears twice in the URL template.
				url = templ.format(pg, pg)
				print(url)
				r = requests.get(url, headers=headers)
				html = r.text
				for film in get_film_names(html):
					f.write(film + '\n')

	def _score_value(text):
		"""Return the scraped score text as an int, or -1 if it is not a number."""
		try:
			return int(text.strip().rstrip('%'))
		except ValueError:
			return -1


	def find_score(fileName='amaz-films.txt'):
		"""Look up a score for every film listed in fileName and rank them.

		fileName -- UTF-8 text file with one film per line, either
		            'name<TAB>year' or just 'name' (the year then defaults
		            to '2010').

		Films for which get_rotten_tomatoes_score returns a score are
		printed and, once all lines are processed, written to
		fileName + '.score' as '<score> - <name>' lines, highest score
		first. A failure on one line is printed and that line is skipped.
		"""
		films = {}
		with codecs.open(filename=fileName, mode='r', encoding='utf-8') as f:
			for line in f:
				try:
					line = line.replace('\n', '')
					if '\t' in line:
						filmName, filmYear = line.split('\t')
					else:
						# Assign the film name, not the fileName parameter: the
						# old code clobbered fileName and left filmName stale.
						filmName = line
						filmYear = '2010'
					score = get_rotten_tomatoes_score(filmName, filmYear)
					if score is not None:
						# Key by film name so every film keeps its own entry.
						films[filmName] = score
						# Unicode template: a byte-string template fails on
						# non-ASCII titles under Python 2.
						print(u'{} => {}'.format(filmName, score))
				except Exception as e:
					# Best effort: report the problem and carry on.
					print(e)
		# Scores are scraped text; rank them numerically so that '100'
		# sorts above '95' instead of below it.
		sortedFilms = sorted(films, key=lambda name: _score_value(films[name]), reverse=True)
		with codecs.open(filename=fileName + '.score', mode='w', encoding='utf-8') as fr:
			for film in sortedFilms:
				txt = u'{} - {}'.format(films[film], film)
				print(txt)
				fr.write(txt + '\n')

	# Step 1: scrape Amazon result pages 1 to 399 and store the film names in a file.
	iterate(frompg=1, pgcount=400)

	# Step 2: look up a score for each film name loaded from that file.
	find_score()