scrape_abc_movies_review.py
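# Scrape movie reviews from ABC's "At the Movies" site (abc.net.au/atthemovies)
# and store each review, its star ratings and its viewer comments in a local
# MySQL database via peewee. Written for Python 2 with BeautifulSoup 3,
# requests and peewee; a MySQL server with a 'movies' database is assumed to
# be running on localhost.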
from BeautifulSoup import BeautifulSoup
import requests
import os
import re
import peewee
from peewee import *

db = MySQLDatabase('movies', host="localhost", user='root')
db.connect()
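# ORM models: Movie holds one row per review (title, both raters' star
# ratings, description, review body, production details and source URL);
# Comments holds one row per viewer comment, keyed by the movie title.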
class Movie(peewee.Model):
    title = peewee.TextField()
    rater1 = peewee.TextField()
    star1 = peewee.TextField()
    rater2 = peewee.TextField()
    star2 = peewee.TextField()
    description = peewee.TextField()
    review_content = peewee.TextField()
    production_details = peewee.TextField()
    url = peewee.TextField()

    class Meta:
        database = db
class Comments(peewee.Model):
    title = peewee.TextField()
    rater = peewee.TextField()
    rating = peewee.TextField()
    comment = peewee.TextField()

    class Meta:
        database = db
# Recreate both tables on every run so the scrape starts from a clean slate.
Movie.drop_table(fail_silently=True)
Movie.create_table()
Comments.drop_table(fail_silently=True)
Comments.create_table()
# Movie.delete().where(True).execute()

base_url = 'http://www.abc.net.au/atthemovies/review/byyear/{0}.htm'
years = range(2014, 2003, -1)
DEBUG = False
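# parse_movie: fetch a review page over HTTP, pull out the title, the two
# raters and their star ratings, and append one CSV line to
# 'abc_movie_rating.csv'; failures are appended to a file named 'error'.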
def parse_movie(url):
    try:
        r = requests.get(url)
        html = r.text
        # save to file
        # with file('movies/' + url.rsplit('/', 1)[1], 'w') as f:
        #     f.write(html.encode('utf-8'))
        soup = BeautifulSoup(html)
        videoWrapper = soup.find(id='videoWrapper')
        movie_title = videoWrapper.find('h1').text
        if DEBUG: print movie_title
        score = videoWrapper.find('p', {'class': 'score'})
        # Split the score line on ':' to recover the two rater names; the
        # trailing piece is discarded.
        rater1, rater2, dummy = score.text.split(':')
        # Each rating is the first word of a star image's alt text.
        star1 = None
        star2 = None
        for img in score.findAll('img'):
            star = img.get('alt').split(' ')[0]
            if star1 is None:
                star1 = star
            else:
                star2 = star
        if DEBUG: print rater1, rater2
        if DEBUG: print star1, star2
        line = ','.join([movie_title, rater1, star1, rater2, star2, url]).encode('utf-8')
        print line
        with file('abc_movie_rating.csv', 'a') as f:
            f.write(line + '\n')
    except Exception as e:
        print url
        print e.message
        with file('error', 'a') as f:
            f.write(url + ',' + e.message + '\n')
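# parse_file: parse a review page cached locally under 'movies/', then save
# the movie and its comments to the database. Handles both comment containers
# found in the markup: the 'loadComments' list and the 'audiencereview1' block.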
def parse_file(name):
    url = 'http://www.abc.net.au/atthemovies/txt/' + name
    with file('movies/' + name, 'r') as f:
        html = f.read()
    soup = BeautifulSoup(html.decode('utf-8'))
    content = soup.find(id='content')
    # if videoWrapper is None:
    #     videoWrapper = soup.find(id='storyImage')
    movie_title = content.find('h1').text
    if DEBUG: print movie_title
    score = content.find('p', {'class': 'score'})
    raters = score.text.split(':')
    try: rater1 = raters[0]
    except IndexError: rater1 = ''
    try: rater2 = raters[1]
    except IndexError: rater2 = ''
    star1 = ''
    star2 = ''
    for img in score.findAll('img'):
        star = img.get('alt').split(' ')[0]
        if star1 == '':
            star1 = star
        else:
            star2 = star
    if DEBUG: print rater1, rater2
    if DEBUG: print star1, star2
    # line = ','.join([movie_title, rater1, star1, rater2, star2, url]).encode('utf-8')
    print movie_title, url
    # with file('abc_movie_rating.csv', 'a') as ff:
    #     ff.write(line + '\n')
    movie = Movie(title=movie_title, rater1=rater1, star1=star1, rater2=rater2, star2=star2, url=url)
    try:
        description = content.find('p', {'class': 'description'}).text
    except AttributeError:
        description = ''
    movie.description = description
    # Collect the review body: every following <p> except the description and
    # score paragraphs, stopping at the first one immediately followed by an
    # <i> tag.
    review_content = ''
    for p in content.findAllNext('p'):
        if p.get('class', None) == 'description':
            continue
        if p.get('class', None) == 'score':
            continue
        if p.findNext().name == 'i':
            break
        review_content += p.text
    movie.review_content = review_content
    production_details = soup.find('p', {'class': 'moviedetails'}).contents
    movie.production_details = production_details
    # Comments in the 'loadComments' container, one <li> per comment. A rating
    # embedded as "[[N]]" in the comment text is extracted when present.
    allNewComments = soup.find(id='loadComments')
    if allNewComments is not None:
        for li in allNewComments.findAll('li'):
            comment = Comments()
            comment.title = movie_title
            comment.rater = li.find('h3').text
            comment.comment = li.find('p', {'class': 'comment'}).text
            m = re.search(r'\[\[(\d+)\]\]', comment.comment)
            if m:
                comment.rating = m.group(1)
            else:
                comment.rating = 'N/A'
            comment.save()
    # Audience reviews in the 'audiencereview1' block; the rating is taken
    # from the star-image filename (extension stripped).
    audiencereview1 = soup.findAll(id='audiencereview1')
    for aud in audiencereview1:
        try:
            comment = Comments()
            comment.title = movie_title
            comment.rater = aud.find('p').contents[1].replace(' ', '', 1)
            comment.rating = aud.find('p').find('img').get('src').rsplit('/', 1)[1][:-4]
            comment.comment = aud.find('p').contents[27].replace(' ', '', 1)
            comment.save()
        except Exception as ex:
            print ex.message
    print movie.save()
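# parse_year: fetch the review index for a single year and run parse_movie on
# every review linked from an <h3> heading.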
def parse_year(url):
    r = requests.get(url)
    html = r.text
    soup = BeautifulSoup(html)
    content = soup.find(id='content')
    for h3 in content.findAll(name='h3'):
        parse_movie(h3.find('a').get('href'))
        if DEBUG: break
# Entry point: either scrape the yearly index pages live (commented out
# below) or parse review pages already cached in the 'movies' directory.
# for y in years:
#     parse_year(base_url.format(str(y)))
#     if DEBUG: break
# parse_file('s1148397.htm')
# parse_file('s3903329.htm')
for f in os.listdir('movies'):
    parse_file(f)