Skip to content

Instantly share code, notes, and snippets.

@wenming
Last active December 15, 2015 14:58
Show Gist options
  • Save wenming/5277993 to your computer and use it in GitHub Desktop.
Save wenming/5277993 to your computer and use it in GitHub Desktop.
Gutenberg crawler that copies english only documents
#!/usr/bin/python
# version 0.1 Wenming Ye 2/25/2012
#Extract English and Text only content out of the Gutenberg DVD. 2010
# If you have questions, please contact me for the latest version.
# feel free to modify the scripts to your needs.
# STEP 1: Run this in the Cygwin environment. If you don't want to use Cygwin, you can modify the file-copy step embedded in the script.
# This script parses the HTML index pages (TITLES) and finds English-language books and their ZIP resource URLs.
# Run this in the gutenberg main INDEXES dir in gutenberg "www.gutenberg.org/INDEXES"
# Removes pdf, html, and images, and non-english items. All the zip files will be copied into the INDEXES/zips
# STEP 2: Then you can extract all the zip files by running >>>>find ./ -name "*.zip" -exec unzip -o {} \;<<<<
# STEP 3: find ./ -name "*.txt" // that's your list of text. You should see about 26942 total # of text files.
# STEP 4: remove *readme.txt you can use the find utility again. find ./ -name "*readme.txt" -exec rm {} \;
# STEP 5: YOU CAN DO THAT FOR htm, html, etc.
# You will end up with 26900 relatively clean set of files. find ./ -name "*.txt" -exec cp {} my_text_dir \;
# TODO: get rid of UTF8 duplicates vs. ASCII.
import commands
import os
import shutil
import urllib
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint
# Class for parsing Book HTML page to extract the ZIP (actual URL for the books).
class BookPropertyHTMLParser(HTMLParser):
    """Collect the text downloads linked from a book page and copy them.

    Every ``<a href=...>`` whose target contains ``.zip`` or ``.txt`` is
    recorded in ``url_list`` and copied into ``dest_dir``, except targets
    that contain one of ``UNWANTED_MARKERS``.

    Attributes:
        url_list: link targets that passed the filter, in document order.
        failed_copies: link targets that passed the filter but could not
            be copied (missing file, unreadable, ...).
        dest_dir: directory the accepted files are copied into.
    """

    # A link target must contain one of these to be considered at all.
    WANTED_MARKERS = (".zip", ".txt")
    # ...and is rejected when it contains any of these.  The file header
    # says the crawl drops html, image and pdf archives; "h.zip" is
    # presumably the html flavour.  Matching is by substring, as before.
    UNWANTED_MARKERS = ("h.zip", "_images.zip", "_pdf.zip")

    def __init__(self, dest_dir="zips"):
        """Create a parser that copies accepted files into *dest_dir*."""
        HTMLParser.__init__(self)
        self.url_list = []
        self.failed_copies = []
        self.dest_dir = dest_dir

    def handle_starttag(self, tag, attrs):
        """Record and copy the href of an ``<a>`` tag when it is wanted."""
        if tag != "a":
            return
        for name, value in attrs:
            # Only the link target is a path.  Attributes written without
            # a value are reported by HTMLParser with value None.
            if name != "href" or not value:
                continue
            if not self._is_wanted(value):
                continue
            self.url_list.append(value)
            self._copy_to_dest(value)

    def _is_wanted(self, target):
        """Return True when *target* names a text download to keep."""
        if not any(marker in target for marker in self.WANTED_MARKERS):
            return False
        return not any(marker in target for marker in self.UNWANTED_MARKERS)

    def _copy_to_dest(self, target):
        """Copy *target* into ``dest_dir``; remember it if the copy fails."""
        try:
            # shutil.copy takes the path as-is, so names with spaces or
            # shell metacharacters are copied correctly and no external
            # "cp" program is needed.
            shutil.copy(target, self.dest_dir)
        except EnvironmentError:
            # Best effort, like the shell "cp" whose status was ignored:
            # a broken link must not abort the crawl.
            self.failed_copies.append(target)
# parsing the title page to find any English language book (English)
class TitleFilesHTMLParser(HTMLParser):
    """Collect the link of every ``<h3>`` entry whose text has "(English)".

    Attributes:
        book_title: text seen since the most recent ``<h3>`` start tag.
        book_attr: href of the first link seen since the most recent
            ``<h3>`` start tag, or "" when none was seen yet.
        book_property_list: hrefs of the entries accepted so far, in
            document order.
    """

    def __init__(self):
        """Create a parser with no entries collected."""
        HTMLParser.__init__(self)
        self.book_title = ""
        self.book_attr = ""
        self.book_property_list = []

    def handle_starttag(self, tag, attrs):
        """Start a new entry on ``<h3>``; remember the entry's link on ``<a>``."""
        if tag == "h3":
            self.book_title = ""
            self.book_attr = ""
        elif tag == "a" and not self.book_attr:
            # Read the href by name rather than by position, so extra
            # attributes (class, title, ...) or valueless ones (value
            # None) cannot corrupt the path.
            # NOTE(review): when a heading holds several links the first
            # one is kept -- confirm against the real index pages.
            for name, value in attrs:
                if name == "href" and value:
                    self.book_attr = value
                    break

    def handle_endtag(self, tag):
        """On ``</h3>``, keep the entry if it is English and has a link."""
        if tag != "h3":
            return
        # An entry without a link has no page to visit, so it is skipped
        # instead of being stored as an empty path.
        if "(English)" in self.book_title and self.book_attr:
            self.book_property_list.append(self.book_attr)

    def handle_data(self, data):
        """Accumulate text; it is reset at the next ``<h3>`` start tag."""
        self.book_title += data
# get the zip URLs on the Book HTML property page.
def get_zip_urls(book_url):
    """Parse one book page, copy its text downloads, and report them.

    Args:
        book_url: path of the book page, relative to the current working
            directory (it is appended to "file://<cwd>/").

    Returns:
        The list of download paths that BookPropertyHTMLParser accepted.
        The same list is printed next to *book_url*.
    """
    book_url_string = "file://" + os.getcwd() + "/" + book_url
    book_page_file = urllib.urlopen(book_url_string)
    try:
        book_page_html = book_page_file.read()
    finally:
        # Close the handle even when the read fails.
        book_page_file.close()
    book_page_parser = BookPropertyHTMLParser()
    book_page_parser.feed(book_page_html)
    # you might want to get rid of duplicates for each book. Some of them
    # have utf8, and ASCII.
    print("%s %s" % (book_url, book_page_parser.url_list))
    return book_page_parser.url_list
# Loop through the title page and find all the Book Property URLs.
def get_english_only_urls(title_page_url):
    """Process every English book listed on one title index page.

    Reads *title_page_url*, extracts the book page links of the English
    entries with TitleFilesHTMLParser, adds their count to the module
    global ``total_books`` and calls get_zip_urls() on each of them.

    Args:
        title_page_url: URL of a title index page, as accepted by
            urllib.urlopen.

    Returns:
        The list of book page paths found on the page.
    """
    global total_books
    title_page = urllib.urlopen(title_page_url)
    try:
        title_page_html = title_page.read()
    finally:
        # Close the handle even when the read fails.
        title_page.close()
    parser = TitleFilesHTMLParser()
    parser.feed(title_page_html)
    total_books += len(parser.book_property_list)
    # Debugging tip: to try a single book, replace the list with one
    # entry such as ['../etext/28964.html'] before the loop.
    # go parse each file and get the zip file URL
    for book_url in parser.book_property_list:
        get_zip_urls(book_url)
    return parser.book_property_list
# MAIN FUNCTION HERE run this in the gutenberg main INDEXES dir in gutenberg "www.gutenberg.org/INDEXES"
# Make sure the directory that receives the copied files exists.
if not os.path.exists("zips"):
    os.makedirs("zips")

# Running count of English books; get_english_only_urls() updates it.
total_books = 0

# One title index page per letter A-Z, plus the catch-all "OTHER" page.
titleFileList = [chr(code) for code in range(ord('A'), ord('Z') + 1)]
titleFileList.append('OTHER')

# For each title page, find the books' HTML description pages; the ZIP
# file URL of the actual book is extracted from each description page.
for suffix in titleFileList:
    title_page_url = "file://" + os.getcwd() + "/TITLES_" + suffix + ".HTML"
    get_english_only_urls(title_page_url)
print(total_books)
#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment