Skip to content

Instantly share code, notes, and snippets.

@GoSteven
Last active August 29, 2015 14:04
Show Gist options
  • Save GoSteven/51fd8335136170f7cc63 to your computer and use it in GitHub Desktop.
scrape_abc_movies_review.py
from BeautifulSoup import BeautifulSoup
import requests
import os
import re
import peewee
from peewee import *
# MySQL connection for the scraped review data.
# NOTE(review): assumes a local, passwordless root account and an existing
# database named 'movies' — confirm before running.
db = MySQLDatabase('movies',host="localhost",user='root')
db.connect()
class Movie(Model):
    """One reviewed movie scraped from the At The Movies site.

    Every column is free-form text: the scraper stores whatever the page
    markup yields, without any normalisation.
    """
    title = TextField()               # movie title (page <h1>)
    rater1 = TextField()              # first reviewer's name
    star1 = TextField()               # first reviewer's star rating
    rater2 = TextField()              # second reviewer's name
    star2 = TextField()               # second reviewer's star rating
    description = TextField()         # short blurb paragraph
    review_content = TextField()      # full review body text
    production_details = TextField()  # production credits paragraph
    url = TextField()                 # source page URL

    class Meta:
        database = db
class Comments(Model):
    """A single visitor comment attached to a reviewed movie (by title)."""
    title = TextField()    # title of the movie the comment belongs to
    rater = TextField()    # commenter's display name
    rating = TextField()   # commenter's star rating, or 'N/A'
    comment = TextField()  # comment body text

    class Meta:
        database = db
# Recreate both tables from scratch on every run: this script re-scrapes
# everything, so stale rows are dropped rather than updated.
Movie.drop_table(fail_silently=True)
Movie.create_table()
Comments.drop_table(fail_silently=True)
Comments.create_table()
# Movie.delete().where(True).execute()
# By-year index pages, newest first (2014 down to 2004).
base_url = 'http://www.abc.net.au/atthemovies/review/byyear/{0}.htm'
years = range(2014, 2003, -1)
# When True: verbose prints and early loop exits for quick smoke runs.
DEBUG = False
def parse_movie(url):
try:
r = requests.get(url)
html = r.text
# save to file
# with file('movies/' + url.rsplit('/',1)[1], 'w') as f:
# f.write(html.encode('utf-8'))
soup = BeautifulSoup(html)
videoWrapper = soup.find(id='videoWrapper')
movie_title = videoWrapper.find('h1').text
if DEBUG: print movie_title
score = videoWrapper.find('p', {'class':'score'})
rater1, rater2, dummy = score.text.split(':')
star1 = None
star2 = None
for img in score.findAll('img'):
star = img.get('alt').split(' ')[0]
if star1 is None:
star1 = star
else:
star2 = star
if DEBUG: print rater1, rater2
if DEBUG: print star1, star2
line = ','.join([movie_title, rater1, star1, rater2, star2, url]).encode('utf-8')
print line
with file('abc_movie_rating.csv','a') as f:
f.write(line + '\n');
except Exception as e:
print url
print e.message
with file('error','a') as f:
f.write(url + ',' + e.message + '\n')
def parse_file(name):
url = 'http://www.abc.net.au/atthemovies/txt/' + name
with file('movies/' + name, 'r') as f:
html = f.read()
soup = BeautifulSoup(html.decode('utf-8'))
content = soup.find(id='content')
# if videoWrapper is None:
# videoWrapper = soup.find(id='storyImage')
movie_title = content.find('h1').text
if DEBUG: print movie_title
score = content.find('p', {'class':'score'})
raters = score.text.split(':')
try: rater1 = raters[0]
except IndexError: rater1 = ''
try: rater2 = raters[1]
except IndexError: rater2 = ''
star1 = ''
star2 = ''
for img in score.findAll('img'):
star = img.get('alt').split(' ')[0]
if star1 is '':
star1 = star
else:
star2 = star
if DEBUG: print rater1, rater2
if DEBUG: print star1, star2
# line = ','.join([movie_title, rater1, star1, rater2, star2, url]).encode('utf-8')
print movie_title, url
# with file('abc_movie_rating.csv','a') as ff:
# ff.write(line + '\n')
movie = Movie(title=movie_title, rater1=rater1, star1=star1, rater2=rater2, star2=star2, url=url)
try:
description = content.find('p', {'class': 'description'}).text
except:
description = ''
movie.description = description
review_content = ''
for p in content.findAllNext('p'):
if p.get('class', None) == 'description':
continue
if p.get('class', None) == 'score':
continue
if p.findNext().name == 'i':
break
review_content += p.text
movie.review_content = review_content
production_details = soup.find('p', {'class': 'moviedetails'}).contents
movie.production_details = production_details
# comments
allNewComments = soup.find(id='loadComments')
if allNewComments is not None:
for li in allNewComments.findAll('li'):
comment = Comments()
comment.title = movie_title
comment.rater = li.find('h3').text
comment.comment = li.find('p', {'class': 'comment'}).text
m = re.search('\[\[(\d+)\]\]', comment.comment)
if m:
comment.rating=m.groups(1)[0]
else:
comment.rating='N/A'
comment.save()
audiencereview1 = soup.findAll(id='audiencereview1')
for aud in audiencereview1:
try:
comment = Comments()
comment.title = movie_title
comment.rater = aud.find('p').contents[1].replace(' ', '', 1)
comment.rating = aud.find('p').find('img').get('src').rsplit('/',1)[1][:-4]
comment.comment = aud.find('p').contents[27].replace(' ', '', 1)
comment.save()
except Exception as ex:
print ex.message
print movie.save()
def parse_year(url):
    """Scrape a by-year index page and run parse_movie on every listed review link."""
    soup = BeautifulSoup(requests.get(url).text)
    listing = soup.find(id='content')
    for heading in listing.findAll(name='h3'):
        parse_movie(heading.find('a').get('href'))
        if DEBUG:
            break
# Entry point: parse every page previously saved under movies/.
# (Live-crawl and single-file variants kept below for reference.)
# for y in years:
#     parse_year(base_url.format(str(y)))
#     if DEBUG: break
# parse_file('s1148397.htm')
# parse_file('s3903329.htm')
for saved_page in os.listdir('movies'):
    parse_file(saved_page)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment