wenming/gutenbergcrawl

## gutenbergcrawl
#!/usr/bin/python
# version 0.1 Wenming Ye   2/25/2012
#Extract English and Text only content out of the Gutenberg DVD. 2010
# If you have questions, please contact me for the latest version.
# feel free to modify the scripts to your needs.
# STEP 1: Run this in the Cygwin environment. If you don't want to use Cygwin, you can modify the copy command embedded in the script.
# This file parses the HTML index pages (TITLES) and finds English-language books and their ZIP resource URLs.
# Run this in the gutenberg main INDEXES dir in gutenberg  "www.gutenberg.org/INDEXES"
# Removes pdf, html, and images, and non-english items.   All the zip files will be copied into the INDEXES/zips
# STEP 2: Then you can extract all the zip files by running >>>>find ./ -name "*.zip" -exec unzip -o {} \;<<<<
# STEP 3: find ./ -name "*.txt" // that's your list of text.   You should see about 26942 total # of text files.
# STEP 4:  remove *readme.txt   you can use the find utility again. find ./ -name "*readme.txt" -exec rm {} \;
# STEP 5:  YOU CAN DO THAT FOR htm, html, etc.
# You will end up with 26900 relatively clean set of files.  find ./ -name "*.txt" -exec cp {} my_text_dir \;
# TODO:   get rid of UTF8 duplicates vs. ASCII.

from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint
import urllib
import os
import commands

# Class for parsing Book HTML page to extract the ZIP (actual URL for the books).
class BookPropertyHTMLParser(HTMLParser):
    """Collect the plain-text download links found on a book's property page.

    Each ``<a href="...">`` whose target contains ``.zip`` or ``.txt`` is
    appended to ``url_list`` and copied into the ``zips`` directory, unless
    the target names an HTML (``h.zip``), image (``_images.zip``) or PDF
    (``_pdf.zip``) archive.
    """

    # Substrings that mark the non-text editions to leave out.
    _SKIPPED_MARKERS = ("h.zip", "_images.zip", "_pdf.zip")

    def __init__(self):
        HTMLParser.__init__(self)
        self.url_list = []  # accepted link targets, in document order

    def handle_starttag(self, tag, attrs):
        """Record and copy the target of every acceptable ``<a href>`` link.

        Args:
            tag: tag name as supplied by HTMLParser.
            attrs: list of ``(name, value)`` pairs for the tag.
        """
        if tag != "a":
            return
        for name, value in attrs:
            # Only the link target is of interest.  Looking at the attribute
            # name (instead of chopping four characters off name + value)
            # keeps other attributes out, and valueless attributes, whose
            # value is None, are skipped as well.
            if name != "href" or not value:
                continue
            if ".zip" not in value and ".txt" not in value:
                continue
            if any(marker in value for marker in self._SKIPPED_MARKERS):
                continue
            self.url_list.append(value)
            self._copy_to_zips(value)

    def _copy_to_zips(self, path):
        """Copy *path* into ``zips``; report a failure on stderr and carry on."""
        # Imported here so the class does not depend on the file's import list.
        import shutil
        import sys

        try:
            # shutil.copy avoids building a shell command from the link text,
            # so names with spaces or shell metacharacters are handled safely.
            shutil.copy(path, "zips")
        except EnvironmentError as err:
            # The copy is best effort: a missing file must not stop the crawl.
            sys.stderr.write("could not copy %s into zips: %s\n" % (path, err))

# parsing the title page to find any English language book (English)
class TitleFilesHTMLParser(HTMLParser):
    """Find the property pages of English-language books on a TITLES page.

    Text is accumulated from each ``<h3>`` start tag onwards.  When the
    heading closes and the collected text contains ``(English)``, the
    ``href`` of the first link seen since the heading opened is appended to
    ``book_property_list``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.book_title = ""  # text collected since the last <h3>
        self.book_attr = ""  # href of the current entry's link
        self.book_property_list = []  # property-page URLs of English books

    def handle_starttag(self, tag, attrs):
        """Reset the entry state on ``<h3>``; remember the href of ``<a>``."""
        if tag == "h3":
            # A new index entry starts: forget the previous one.
            self.book_title = ""
            self.book_attr = ""
        elif tag == "a" and not self.book_attr:
            # Pick the href by name rather than slicing name + value, so
            # extra or valueless attributes cannot corrupt the URL.
            for name, value in attrs:
                if name == "href" and value:
                    self.book_attr = value
                    break

    def handle_endtag(self, tag):
        """Keep the entry when its heading text is marked ``(English)``."""
        if tag != "h3":
            return
        # An entry without a link has nothing to visit, so it is left out.
        if "(English)" in self.book_title and self.book_attr:
            self.book_property_list.append(self.book_attr)

    def handle_data(self, data):
        """Append text content to the title being collected."""
        self.book_title += data

# get the zip URLs on the Book HTML property page.
def get_zip_urls(book_url):
    """Parse one book property page and report the download links it holds.

    The page is read through a ``file://`` URL built from the current working
    directory and *book_url*, then fed to BookPropertyHTMLParser, which also
    copies every accepted file into ``zips`` while parsing.

    Args:
        book_url: location of the property page, relative to the current
            working directory.

    Returns:
        The list of accepted link targets.  Nothing was returned before, so
        callers that ignore the result are unaffected.
    """
    book_url_string = "file://" + os.getcwd() + "/" + book_url
    book_page_file = urllib.urlopen(book_url_string)
    try:
        book_page_html = book_page_file.read()
    finally:
        # Close the handle even when the read fails.
        book_page_file.close()

    book_page_parser = BookPropertyHTMLParser()
    book_page_parser.feed(book_page_html)

    # Some books list both a UTF-8 and an ASCII file; duplicates are not
    # filtered here.
    print("%s %s" % (book_url, book_page_parser.url_list))
    return book_page_parser.url_list


# Loop through the title page and find all the Book Property URLs.
def get_english_only_urls(title_page_url):
    """Process every English-language book listed on one title index page.

    The page at *title_page_url* is read and parsed with
    TitleFilesHTMLParser.  The number of books found is added to the
    module-level ``total_books`` counter, and get_zip_urls() is called for
    each book's property page.

    Args:
        title_page_url: URL of the title index page to read.
    """
    global total_books

    # Named "title_page" rather than "file" so the builtin is not shadowed.
    title_page = urllib.urlopen(title_page_url)
    try:
        title_page_html = title_page.read()
    finally:
        # Close the handle even when the read fails.
        title_page.close()

    parser = TitleFilesHTMLParser()
    parser.feed(title_page_html)
    total_books += len(parser.book_property_list)

    # Visit each book's property page to pick up its download links.
    for book_url in parser.book_property_list:
        get_zip_urls(book_url)


#  MAIN FUNCTION HERE  run this in the gutenberg main INDEXES dir in gutenberg  "www.gutenberg.org/INDEXES"

# get a list of the title pages a-z, other
# Make sure the destination directory for the copied archives is present.
if not os.path.exists("zips"):
    os.makedirs("zips")

# One index page per initial letter A-Z, plus the catch-all "OTHER" page.
total_books = 0
titleFileList = [chr(code) for code in range(ord('A'), ord('Z') + 1)]
titleFileList.append('OTHER')

# Every index page links to the property pages of its books, and each
# property page in turn lists the files that belong to the book.
for page_suffix in titleFileList:
    title_page_url = "file://" + os.getcwd() + "/TITLES_" + page_suffix + ".HTML"
    get_english_only_urls(title_page_url)

print(total_books)
#
	#!/usr/bin/python
	# version 0.1 Wenming Ye 2/25/2012
	#Extract English and Text only content out of the Gutenberg DVD. 2010
	# If you have questions, please contact me for the latest version.
	# feel free to modify the scripts to your needs.
	# STEP 1: Run this in the Cygwin environment. If you don't want to use Cygwin, you can modify the copy command embedded in the script.
	# This file parses the HTML index pages (TITLES) and finds English-language books and their ZIP resource URLs.
	# Run this in the gutenberg main INDEXES dir in gutenberg "www.gutenberg.org/INDEXES"
	# Removes pdf, html, and images, and non-english items. All the zip files will be copied into the INDEXES/zips
	# STEP 2: Then you can extract all the zip files by running >>>>find ./ -name "*.zip" -exec unzip -o {} \;<<<<
	# STEP 3: find ./ -name "*.txt" // that's your list of text. You should see about 26942 total # of text files.
	# STEP 4: remove *readme.txt files; you can use the find utility again: find ./ -name "*readme.txt" -exec rm {} \;
	# STEP 5: YOU CAN DO THAT FOR htm, html, etc.
	# You will end up with 26900 relatively clean set of files. find ./ -name "*.txt" -exec cp {} my_text_dir \;
	# TODO: get rid of UTF8 duplicates vs. ASCII.

	from HTMLParser import HTMLParser
	from htmlentitydefs import name2codepoint
	import urllib
	import os
	import commands

	# Class for parsing Book HTML page to extract the ZIP (actual URL for the books).
	class BookPropertyHTMLParser(HTMLParser):
	    """Collect the plain-text download links found on a book's property page.

	    Each ``<a href="...">`` whose target contains ``.zip`` or ``.txt`` is
	    appended to ``url_list`` and copied into the ``zips`` directory, unless
	    the target names an HTML (``h.zip``), image (``_images.zip``) or PDF
	    (``_pdf.zip``) archive.
	    """

	    # Substrings that mark the non-text editions to leave out.
	    _SKIPPED_MARKERS = ("h.zip", "_images.zip", "_pdf.zip")

	    def __init__(self):
	        HTMLParser.__init__(self)
	        self.url_list = []  # accepted link targets, in document order

	    def handle_starttag(self, tag, attrs):
	        """Record and copy the target of every acceptable ``<a href>`` link.

	        Args:
	            tag: tag name as supplied by HTMLParser.
	            attrs: list of ``(name, value)`` pairs for the tag.
	        """
	        if tag != "a":
	            return
	        for name, value in attrs:
	            # Only the link target is of interest.  Looking at the attribute
	            # name (instead of chopping four characters off name + value)
	            # keeps other attributes out, and valueless attributes, whose
	            # value is None, are skipped as well.
	            if name != "href" or not value:
	                continue
	            if ".zip" not in value and ".txt" not in value:
	                continue
	            if any(marker in value for marker in self._SKIPPED_MARKERS):
	                continue
	            self.url_list.append(value)
	            self._copy_to_zips(value)

	    def _copy_to_zips(self, path):
	        """Copy *path* into ``zips``; report a failure on stderr and carry on."""
	        # Imported here so the class does not depend on the file's import list.
	        import shutil
	        import sys

	        try:
	            # shutil.copy avoids building a shell command from the link text,
	            # so names with spaces or shell metacharacters are handled safely.
	            shutil.copy(path, "zips")
	        except EnvironmentError as err:
	            # The copy is best effort: a missing file must not stop the crawl.
	            sys.stderr.write("could not copy %s into zips: %s\n" % (path, err))

	# parsing the title page to find any English language book (English)
	class TitleFilesHTMLParser(HTMLParser):
	    """Find the property pages of English-language books on a TITLES page.

	    Text is accumulated from each ``<h3>`` start tag onwards.  When the
	    heading closes and the collected text contains ``(English)``, the
	    ``href`` of the first link seen since the heading opened is appended to
	    ``book_property_list``.
	    """

	    def __init__(self):
	        HTMLParser.__init__(self)
	        self.book_title = ""  # text collected since the last <h3>
	        self.book_attr = ""  # href of the current entry's link
	        self.book_property_list = []  # property-page URLs of English books

	    def handle_starttag(self, tag, attrs):
	        """Reset the entry state on ``<h3>``; remember the href of ``<a>``."""
	        if tag == "h3":
	            # A new index entry starts: forget the previous one.
	            self.book_title = ""
	            self.book_attr = ""
	        elif tag == "a" and not self.book_attr:
	            # Pick the href by name rather than slicing name + value, so
	            # extra or valueless attributes cannot corrupt the URL.
	            for name, value in attrs:
	                if name == "href" and value:
	                    self.book_attr = value
	                    break

	    def handle_endtag(self, tag):
	        """Keep the entry when its heading text is marked ``(English)``."""
	        if tag != "h3":
	            return
	        # An entry without a link has nothing to visit, so it is left out.
	        if "(English)" in self.book_title and self.book_attr:
	            self.book_property_list.append(self.book_attr)

	    def handle_data(self, data):
	        """Append text content to the title being collected."""
	        self.book_title += data

	# get the zip URLs on the Book HTML property page.
	def get_zip_urls(book_url):
	    """Parse one book property page and report the download links it holds.

	    The page is read through a ``file://`` URL built from the current working
	    directory and *book_url*, then fed to BookPropertyHTMLParser, which also
	    copies every accepted file into ``zips`` while parsing.

	    Args:
	        book_url: location of the property page, relative to the current
	            working directory.

	    Returns:
	        The list of accepted link targets.  Nothing was returned before, so
	        callers that ignore the result are unaffected.
	    """
	    book_url_string = "file://" + os.getcwd() + "/" + book_url
	    book_page_file = urllib.urlopen(book_url_string)
	    try:
	        book_page_html = book_page_file.read()
	    finally:
	        # Close the handle even when the read fails.
	        book_page_file.close()

	    book_page_parser = BookPropertyHTMLParser()
	    book_page_parser.feed(book_page_html)

	    # Some books list both a UTF-8 and an ASCII file; duplicates are not
	    # filtered here.
	    print("%s %s" % (book_url, book_page_parser.url_list))
	    return book_page_parser.url_list


	# Loop through the title page and find all the Book Property URLs.
	def get_english_only_urls(title_page_url):
	    """Process every English-language book listed on one title index page.

	    The page at *title_page_url* is read and parsed with
	    TitleFilesHTMLParser.  The number of books found is added to the
	    module-level ``total_books`` counter, and get_zip_urls() is called for
	    each book's property page.

	    Args:
	        title_page_url: URL of the title index page to read.
	    """
	    global total_books

	    # Named "title_page" rather than "file" so the builtin is not shadowed.
	    title_page = urllib.urlopen(title_page_url)
	    try:
	        title_page_html = title_page.read()
	    finally:
	        # Close the handle even when the read fails.
	        title_page.close()

	    parser = TitleFilesHTMLParser()
	    parser.feed(title_page_html)
	    total_books += len(parser.book_property_list)

	    # Visit each book's property page to pick up its download links.
	    for book_url in parser.book_property_list:
	        get_zip_urls(book_url)


	# MAIN FUNCTION HERE run this in the gutenberg main INDEXES dir in gutenberg "www.gutenberg.org/INDEXES"

	# get a list of the title pages a-z, other
	# Make sure the destination directory for the copied archives is present.
	if not os.path.exists("zips"):
	    os.makedirs("zips")

	# One index page per initial letter A-Z, plus the catch-all "OTHER" page.
	titleFileList = []
	total_books = 0
	for i in range(ord('A'), ord('Z') + 1):
	    titleFileList.append(chr(i))
	titleFileList.append('OTHER')

	# Every index page links to the property pages of its books, and each
	# property page in turn lists the files that belong to the book.
	for i in titleFileList:
	    title_page_url = "file://" + os.getcwd() + "/TITLES_" + i + ".HTML"
	    get_english_only_urls(title_page_url)

	print(total_books)
	#