Skip to content

Instantly share code, notes, and snippets.

Last active August 29, 2015 14:17
Show Gist options
  • Save andrewheiss/3e4786d9685ff9057d30 to your computer and use it in GitHub Desktop.
Save andrewheiss/3e4786d9685ff9057d30 to your computer and use it in GitHub Desktop.
Scrape Harry Potter
#!/usr/bin/env python3
# --------------
# Load modules
# --------------
from bs4 import BeautifulSoup
from collections import defaultdict
from random import choice
from time import sleep
import urllib.request
import csv
import logging
# --------------------------------
# Scraping and parsing functions
# --------------------------------
def get_url(url):
    """Fetch *url* and return the raw response body as bytes.

    A User-Agent header is picked at random from a small pool of desktop
    browser strings, and the call pauses for a random 10-16 seconds before
    issuing the request so that consecutive fetches are throttled.

    Raises whatever ``urllib.request.urlopen`` raises (for example
    ``urllib.error.URLError``); callers decide how to handle failures.
    """
    user_agents = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 Safari/600.1.25',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.1.17 (KHTML, like Gecko) Version/7.1 Safari/537.85.10',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36',
    ]
    agent = choice(user_agents)

    # The delay was computed but never applied; sleeping here puts the
    # module's `sleep` import to use and spaces out successive requests.
    wait = choice(range(10, 17))
    sleep(wait)

    request = urllib.request.Request(url, headers={'User-Agent': agent})
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(request) as response:
        return response.read()
def parse_chapter(url):
    """Download one chapter page and return its text split into lines.

    Returns a dict with the keys:
      * ``'content'`` - list of strings, the chapter text split on newlines
      * ``'url'``     - the URL that was requested
      * ``'book'``    - book title taken from the file name in the URL
      * ``'chapter'`` - chapter identifier (the last ``_``-separated token
        of the file name), as a string

    If the page cannot be fetched, or it contains no ``.ContentCss``
    element, a warning is logged and the dict is returned with an empty
    ``'content'`` list so that a batch run can move on to the next URL.
    """
    print("Parsing {0}".format(url))

    # Book and chapter come from the file name alone, e.g.
    # ".../Some_Book_Title_12.html" -> book "Some Book Title", chapter "12".
    split_url = url.split('/')[-1].replace('.html', '').split('_')
    content = {
        'content': [],
        'url': url,
        'book': ' '.join(split_url[0:-1]),
        'chapter': split_url[-1],
    }

    try:
        soup = BeautifulSoup(get_url(url))
    except Exception:
        # Best-effort scrape: one bad page must not abort the whole run.
        logging.warning("Error with {0}".format(url))
        print("Error. Moving on.")
        return content

    content_section = soup.select('.ContentCss')
    if not content_section:
        # Page loaded but lacks the expected container; treat as a failure
        # instead of raising IndexError on the [0] lookup.
        logging.warning("Error with {0}".format(url))
        print("Error. Moving on.")
        return content

    # Only the first matching element is used; U+3000 (ideographic space)
    # is stripped from the text before splitting into lines.
    content_clean = content_section[0].text.replace('\u3000', '')
    content['content'] = content_clean.split('\n')
    return content
def convert_to_long(chapter):
    """Reshape a parsed chapter into one record per line of text.

    *chapter* is a dict with ``'book'``, ``'chapter'`` and ``'content'``
    keys, where ``'content'`` is an iterable of strings.

    Returns a ``defaultdict(dict)`` keyed by a 0-based entry number. Each
    value is a dict with ``'book'``, ``'chapter'``, ``'paragraph'``
    (1-based position within the chapter) and ``'text'``. Empty content
    yields an empty mapping.
    """
    long_results = defaultdict(dict)
    for entry_num, text in enumerate(chapter['content']):
        long_results[entry_num] = {
            'book': chapter['book'],
            'chapter': chapter['chapter'],
            'paragraph': entry_num + 1,
            'text': text,
        }
    return long_results
# --------------------------------
# Build a list of URLs to scrape
# --------------------------------
# TODO: url_base is empty here, so the generated URLs are relative paths;
# set it to the site prefix before running the scraper.
url_base = ''

# Per-book URL templates; the literal 'CHAPTER' is replaced by the chapter
# number. TODO: only the first template is present, while `chapters` below
# holds seven counts - add the remaining templates, in the same order, to
# scrape the other books.
specific_books = [
    'Harry_Potter_and_the_Sorcerers_Stone/Harry_Potter_and_the_Sorcerers_Stone_CHAPTER.html',
]

# Number of chapters per book, positionally matched to specific_books.
chapters = [17, 19, 22, 37, 38, 30, 37]

urls_to_parse = []
# zip pairs each template with its chapter count and stops at the shorter
# list, so chapter counts without a template are ignored.
for book_template, chapter_count in zip(specific_books, chapters):
    for j in range(1, chapter_count + 1):
        final_url = url_base + book_template.replace('CHAPTER', str(j))
        urls_to_parse.append(final_url)
# ---------------------------------------
# Scrape all those URLs and save to CSV
# ---------------------------------------
# # Set up log
# logging.basicConfig(filename='errors.log', filemode='w', level=logging.DEBUG,
# format='%(levelname)s %(asctime)s: %(message)s')
# logging.captureWarnings(True)
# csv_started = False
# fieldnames = ['book', 'chapter', 'paragraph', 'text']
# for url in urls_to_parse[0:2]:
# contents = convert_to_long(parse_chapter(url))
# for key, value in contents.items():
# w = csv.DictWriter(open('hp.csv', 'a'), fieldnames)
# if csv_started is False:
# w.writeheader()
# csv_started = True
# w.writerow(value)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment