scrape_abc_movies_review.py
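# Scrape movie reviews from ABC's "At the Movies" site (abc.net.au/atthemovies)
# and store each review, its star ratings and its viewer comments in a local
# MySQL database via peewee. Written for Python 2 with BeautifulSoup 3,
# requests and peewee; a MySQL server with a 'movies' database is assumed to
# be running on localhost.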
from BeautifulSoup import BeautifulSoup
import requests
import os
import re
import peewee
from peewee import *

db = MySQLDatabase('movies', host="localhost", user='root')
db.connect()
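# ORM models: Movie holds one row per review (title, both raters' star
# ratings, description, review body, production details and source URL);
# Comments holds one row per viewer comment, keyed by the movie title.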
class Movie(peewee.Model):
    title = peewee.TextField()
    rater1 = peewee.TextField()
    star1 = peewee.TextField()
    rater2 = peewee.TextField()
    star2 = peewee.TextField()
    description = peewee.TextField()
    review_content = peewee.TextField()
    production_details = peewee.TextField()
    url = peewee.TextField()

    class Meta:
        database = db
class Comments(peewee.Model):
    title = peewee.TextField()
    rater = peewee.TextField()
    rating = peewee.TextField()
    comment = peewee.TextField()

    class Meta:
        database = db
# Recreate both tables on every run so the scrape starts from a clean slate.
Movie.drop_table(fail_silently=True)
Movie.create_table()
Comments.drop_table(fail_silently=True)
Comments.create_table()
# Movie.delete().where(True).execute()

base_url = 'http://www.abc.net.au/atthemovies/review/byyear/{0}.htm'
years = range(2014, 2003, -1)
DEBUG = False
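# parse_movie: fetch a review page over HTTP, pull out the title, the two
# raters and their star ratings, and append one CSV line to
# 'abc_movie_rating.csv'; failures are appended to a file named 'error'.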
def parse_movie(url):
    try:
        r = requests.get(url)
        html = r.text
        # save to file
        # with file('movies/' + url.rsplit('/', 1)[1], 'w') as f:
        #     f.write(html.encode('utf-8'))
        soup = BeautifulSoup(html)
        videoWrapper = soup.find(id='videoWrapper')
        movie_title = videoWrapper.find('h1').text
        if DEBUG: print movie_title
        score = videoWrapper.find('p', {'class': 'score'})
        # Split the score line on ':' to recover the two rater names; the
        # trailing piece is discarded.
        rater1, rater2, dummy = score.text.split(':')
        # Each rating is the first word of a star image's alt text.
        star1 = None
        star2 = None
        for img in score.findAll('img'):
            star = img.get('alt').split(' ')[0]
            if star1 is None:
                star1 = star
            else:
                star2 = star
        if DEBUG: print rater1, rater2
        if DEBUG: print star1, star2
        line = ','.join([movie_title, rater1, star1, rater2, star2, url]).encode('utf-8')
        print line
        with file('abc_movie_rating.csv', 'a') as f:
            f.write(line + '\n')
    except Exception as e:
        print url
        print e.message
        with file('error', 'a') as f:
            f.write(url + ',' + e.message + '\n')
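# parse_file: parse a review page cached locally under 'movies/', then save
# the movie and its comments to the database. Handles both comment containers
# found in the markup: the 'loadComments' list and the 'audiencereview1' block.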
def parse_file(name):
    url = 'http://www.abc.net.au/atthemovies/txt/' + name
    with file('movies/' + name, 'r') as f:
        html = f.read()
    soup = BeautifulSoup(html.decode('utf-8'))
    content = soup.find(id='content')
    # if videoWrapper is None:
    #     videoWrapper = soup.find(id='storyImage')
    movie_title = content.find('h1').text
    if DEBUG: print movie_title
    score = content.find('p', {'class': 'score'})
    raters = score.text.split(':')
    try: rater1 = raters[0]
    except IndexError: rater1 = ''
    try: rater2 = raters[1]
    except IndexError: rater2 = ''
    star1 = ''
    star2 = ''
    for img in score.findAll('img'):
        star = img.get('alt').split(' ')[0]
        if star1 == '':
            star1 = star
        else:
            star2 = star
    if DEBUG: print rater1, rater2
    if DEBUG: print star1, star2
    # line = ','.join([movie_title, rater1, star1, rater2, star2, url]).encode('utf-8')
    print movie_title, url
    # with file('abc_movie_rating.csv', 'a') as ff:
    #     ff.write(line + '\n')
    movie = Movie(title=movie_title, rater1=rater1, star1=star1, rater2=rater2, star2=star2, url=url)
    try:
        description = content.find('p', {'class': 'description'}).text
    except AttributeError:
        description = ''
    movie.description = description
    # Collect the review body: every following <p> except the description and
    # score paragraphs, stopping at the first one immediately followed by an
    # <i> tag.
    review_content = ''
    for p in content.findAllNext('p'):
        if p.get('class', None) == 'description':
            continue
        if p.get('class', None) == 'score':
            continue
        if p.findNext().name == 'i':
            break
        review_content += p.text
    movie.review_content = review_content
    production_details = soup.find('p', {'class': 'moviedetails'}).contents
    movie.production_details = production_details
    # Comments in the 'loadComments' container, one <li> per comment. A rating
    # embedded as "[[N]]" in the comment text is extracted when present.
    allNewComments = soup.find(id='loadComments')
    if allNewComments is not None:
        for li in allNewComments.findAll('li'):
            comment = Comments()
            comment.title = movie_title
            comment.rater = li.find('h3').text
            comment.comment = li.find('p', {'class': 'comment'}).text
            m = re.search(r'\[\[(\d+)\]\]', comment.comment)
            if m:
                comment.rating = m.group(1)
            else:
                comment.rating = 'N/A'
            comment.save()
    # Audience reviews in the 'audiencereview1' block; the rating is taken
    # from the star-image filename (extension stripped).
    audiencereview1 = soup.findAll(id='audiencereview1')
    for aud in audiencereview1:
        try:
            comment = Comments()
            comment.title = movie_title
            comment.rater = aud.find('p').contents[1].replace(' ', '', 1)
            comment.rating = aud.find('p').find('img').get('src').rsplit('/', 1)[1][:-4]
            comment.comment = aud.find('p').contents[27].replace(' ', '', 1)
            comment.save()
        except Exception as ex:
            print ex.message
    print movie.save()
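# parse_year: fetch the review index for a single year and run parse_movie on
# every review linked from an <h3> heading.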
def parse_year(url):
    r = requests.get(url)
    html = r.text
    soup = BeautifulSoup(html)
    content = soup.find(id='content')
    for h3 in content.findAll(name='h3'):
        parse_movie(h3.find('a').get('href'))
        if DEBUG: break
# Entry point: either scrape the yearly index pages live (commented out
# below) or parse review pages already cached in the 'movies' directory.
# for y in years:
#     parse_year(base_url.format(str(y)))
#     if DEBUG: break
# parse_file('s1148397.htm')
# parse_file('s3903329.htm')
for f in os.listdir('movies'):
    parse_file(f)