Last active
December 15, 2015 14:58
-
-
Save wenming/5277993 to your computer and use it in GitHub Desktop.
Gutenberg crawler that copies english only documents
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# version 0.1 Wenming Ye 2/25/2012 | |
#Extract English and Text only content out of the Gutenberg DVD. 2010 | |
# If you have questions, please contact me for the latest version. | |
# feel free to modify the scripts to your needs. | |
# STEP 1: Run this in the Cygwin environment. If you don't want to use Cygwin, you can modify the "cp" command embedded in the script.
# This file parses the HTML index pages (TITLES) and finds English-language books and their ZIP resource URLs.
# Run this in the gutenberg main INDEXES dir in gutenberg "www.gutenberg.org/INDEXES" | |
# Removes pdf, html, and images, and non-english items. All the zip files will be copied into the INDEXES/zips | |
# STEP 2: Then you can extract all the zip files by running >>>>find ./ -name "*.zip" -exec unzip -o {} \;<<<< | |
# STEP 3: find ./ -name "*.txt" // that's your list of texts. You should see about 26942 text files in total.
# STEP 4: remove *readme.txt you can use the find utility again. find ./ -name "*readme.txt" -exec rm {} \; | |
# STEP 5: YOU CAN DO THAT FOR htm, html, etc. | |
# You will end up with 26900 relatively clean set of files. find ./ -name "*.txt" -exec cp {} my_text_dir \; | |
# TODO: get rid of UTF8 duplicates vs. ASCII. | |
import commands
import os
import subprocess
import sys
import urllib
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint
# Class for parsing Book HTML page to extract the ZIP (actual URL for the books). | |
class BookPropertyHTMLParser(HTMLParser):
    """Collect the text/zip resource links found on one book page.

    Every ``<a>`` tag's ``href`` is inspected. An href is accepted when it
    contains ".zip" or ".txt" and none of the excluded markers ("h.zip",
    "_images.zip", "_pdf.zip"). Each accepted href is appended to
    ``url_list`` and copied with ``cp`` into the ``zips`` directory,
    relative to the current working directory.
    """

    # Substrings an href must contain (at least one) to be of interest.
    _WANTED_EXTENSIONS = (".zip", ".txt")
    # Substrings that disqualify an href: HTML, image and PDF archives.
    _SKIP_MARKERS = ("h.zip", "_images.zip", "_pdf.zip")

    def __init__(self):
        HTMLParser.__init__(self)
        # Accepted hrefs, in document order.
        self.url_list = []

    def handle_starttag(self, tag, attrs):
        """Record and copy the resource behind each acceptable ``<a href>``.

        Only the ``href`` attribute is examined; other attributes and
        attributes without a value (reported as ``None``) are ignored.
        """
        if tag != "a":
            return
        for name, value in attrs:
            if name != "href" or not value:
                continue
            if not any(ext in value for ext in self._WANTED_EXTENSIONS):
                continue
            if any(marker in value for marker in self._SKIP_MARKERS):
                continue
            self.url_list.append(value)
            self._copy_to_zips(value)

    def _copy_to_zips(self, path):
        """Copy *path* into the ``zips`` directory with ``cp``, best effort."""
        try:
            # Argument list (no shell) so spaces or shell metacharacters in
            # the href cannot break or alter the command; "--" keeps a
            # leading dash from being read as an option. cp reports its own
            # failures on stderr and a failed copy does not stop the crawl.
            subprocess.call(["cp", "--", path, "zips"])
        except OSError as err:
            # cp itself could not be started (e.g. not on PATH).
            sys.stderr.write("could not run cp for %s: %s\n" % (path, err))
# parsing the title page to find any English language book (English) | |
class TitleFilesHTMLParser(HTMLParser):
    """Find the book-page links of English titles on a TITLES index page.

    Text is accumulated from each ``<h3>`` start tag onward. When the
    matching ``</h3>`` is reached and the accumulated text contains
    "(English)", the most recent ``<a href>`` seen since that ``<h3>``
    is appended to ``book_property_list``.

    NOTE(review): this presumes each index entry is an ``<h3>`` that wraps
    both the title text and its link -- confirm against the index pages.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Text collected since the last <h3> start tag.
        self.book_title = ""
        # href of the most recent <a> since the last <h3> start tag.
        self.book_attr = ""
        # Book page hrefs of all English titles found so far.
        self.book_property_list = []

    def handle_starttag(self, tag, attrs):
        """Reset state on ``<h3>``; remember the link target on ``<a>``."""
        if tag == "h3":
            self.book_title = ""
            self.book_attr = ""
        elif tag == "a":
            for name, value in attrs:
                # Only a real href is a link target; other attributes and
                # value-less attributes (None) must not leak into the URL.
                if name == "href" and value:
                    self.book_attr = value

    def handle_endtag(self, tag):
        """On ``</h3>``, keep the link if the heading text is English."""
        if tag != "h3":
            return
        # Skip headings without a link: an empty URL is of no use to callers.
        if "(English)" in self.book_title and self.book_attr:
            self.book_property_list.append(self.book_attr)

    def handle_data(self, data):
        """Accumulate text; it is cleared at the next ``<h3>`` start tag."""
        self.book_title += data
# get the zip URLs on the Book HTML property page. | |
def get_zip_urls(book_url):
    """Parse one book description page and collect its resource links.

    The page is read from ``file://<current working directory>/<book_url>``
    and fed to a BookPropertyHTMLParser, which also copies each accepted
    resource as a side effect. The book URL and the collected links are
    printed on one line.

    Some books list both a UTF-8 and an ASCII version, so the list may
    contain near-duplicates for the same book.

    Args:
        book_url: href of the book page, relative to the working directory.

    Returns:
        The list of resource hrefs collected from the page.
    """
    book_url_string = "file://" + os.getcwd() + "/" + book_url
    book_page_file = urllib.urlopen(book_url_string)
    try:
        page_html = book_page_file.read()
    finally:
        # Release the handle even when the read fails.
        book_page_file.close()
    book_page_parser = BookPropertyHTMLParser()
    book_page_parser.feed(page_html)
    # A single formatted string prints the same text as "print a, b".
    print("%s %s" % (book_url, book_page_parser.url_list))
    return book_page_parser.url_list
# Loop through the title page and find all the Book Property URLs. | |
def get_english_only_urls(title_page_url):
    """Process every English book listed on one TITLES index page.

    The page is parsed with TitleFilesHTMLParser, the number of books found
    is added to the module-level ``total_books`` counter, and
    get_zip_urls() is called for each book page.

    Args:
        title_page_url: URL of the index page, as passed to urllib.urlopen.
    """
    global total_books
    title_page = urllib.urlopen(title_page_url)
    try:
        title_html = title_page.read()
    finally:
        # Release the handle even when the read fails.
        title_page.close()
    parser = TitleFilesHTMLParser()
    parser.feed(title_html)
    total_books += len(parser.book_property_list)
    # Debugging tip: to process a single book, replace the list below with
    # e.g. ['../etext/28964.html'].
    for book_url in parser.book_property_list:
        get_zip_urls(book_url)
# MAIN FUNCTION HERE run this in the gutenberg main INDEXES dir in gutenberg "www.gutenberg.org/INDEXES" | |
# get a list of the title pages a-z, other | |
# Destination directory for the copied archives.
if not os.path.exists("zips"):
    os.makedirs("zips")

# Running count of English books; updated by get_english_only_urls().
total_books = 0

# Index page suffixes: one per letter A-Z, plus the catch-all 'OTHER'.
titleFileList = [chr(code) for code in range(ord('A'), ord('Z') + 1)]
titleFileList.append('OTHER')

# For each title index page, find the English books' description pages and,
# from those, the resource links of the actual books.
for suffix in titleFileList:
    title_page_url = "file://%s/TITLES_%s.HTML" % (os.getcwd(), suffix)
    get_english_only_urls(title_page_url)

print(total_books)
# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment