Skip to content

Instantly share code, notes, and snippets.

@Juancard
Created September 10, 2017 16:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Juancard/bb09f19c79a008db58ccfd58e5674ac7 to your computer and use it in GitHub Desktop.
Save Juancard/bb09f19c79a008db58ccfd58e5674ac7 to your computer and use it in GitHub Desktop.
A script to download all Spanish-language books from Project Gutenberg
import os
import sys
import argparse
import logging
import requests
import unicodedata
import re
import codecs
import lxml.html
# Base URL of the Gutenberg API; every request URL in this script is built from it.
API_URL = "https://gutenbergapi.org"
def slugify(value):
    """Turn a text string into a lowercase, hyphen-separated ASCII slug.

    The text is NFKD-normalised so accented letters decompose into a base
    letter plus combining marks, then every non-ASCII character is dropped.
    What remains is stripped of everything except word characters,
    whitespace and hyphens, trimmed, lowercased, and runs of
    whitespace/hyphens are collapsed into a single hyphen.

    Args:
        value: Text (unicode) string to convert.

    Returns:
        The slug as a text string; empty if nothing survives the filtering.
    """
    # Decode back to text so the regexes below operate on text on both
    # Python 2 and Python 3 (the ``unicode`` builtin exists only on Python 2,
    # and Python 3 refuses to apply a text pattern to bytes).
    ascii_text = (
        unicodedata.normalize('NFKD', value)
        .encode('ascii', 'ignore')
        .decode('ascii')
    )
    cleaned = re.sub(r'[^\w\s-]', '', ascii_text).strip().lower()
    return re.sub(r'[-\s]+', '-', cleaned)
def loadArgParser(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            what the original zero-argument call did.

    Returns:
        argparse.Namespace with a boolean ``verbose`` attribute.
    """
    parser = argparse.ArgumentParser(description='A script to load all spanish books in gutenberg')
    parser.add_argument(
        "-v", "--verbose",
        help="increase output verbosity",
        action="store_true"
    )
    return parser.parse_args(argv)
def booksIdByLanguage(lang):
    """Fetch the ids of every text the API lists for a language.

    Args:
        lang: Language code appended to the ``language eq`` search query.

    Returns:
        set of the ``text_id`` values found under the ``texts`` key of the
        JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    url = API_URL + "/search/language eq " + lang
    response = requests.get(url)
    # Surface a rejected request as the HTTP error itself instead of a
    # KeyError / JSON decoding error further down.
    response.raise_for_status()
    return {text["text_id"] for text in response.json()["texts"]}
def metadataByBookId(bookId):
    """Fetch a book's metadata and derive what is needed to store it.

    Args:
        bookId: Integer id of the text; formatted with ``%d`` into the URL
            and into the filename.

    Returns:
        dict with:
            ``filename``: the id followed by the slug of every non-null
                title, joined with hyphens (no extension).
            ``link_htm``: present only when one of the ``formaturi`` entries
                ends in ".htm"; if several do, the last one wins.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    response = requests.get(API_URL + "/texts/%d" % bookId)
    response.raise_for_status()
    # Decode the JSON payload once and reuse it for both lookups.
    book_meta = response.json()["metadata"]

    name_parts = ["%d" % bookId]
    name_parts.extend(
        slugify(title) for title in book_meta["title"] if title is not None
    )
    metadata = {"filename": "-".join(name_parts)}

    for uri in book_meta["formaturi"]:
        if uri.endswith(".htm"):
            metadata["link_htm"] = uri
    return metadata
def bodyByBookId(bookId):
    """Fetch the raw body text of a book from the API.

    Args:
        bookId: Integer id of the text; formatted with ``%d`` into the URL.

    Returns:
        The value stored under the ``body`` key of the JSON response, or an
        empty string when the response carries no such key.
    """
    url = API_URL + "/texts/%d/body" % bookId
    # Decode once: the payload holds the body text, so parsing it a second
    # time just to read the same key is wasted work.
    payload = requests.get(url).json()
    if "body" not in payload:
        return ""
    return payload["body"]
def bookTxt(link):
    """Return a book's plain text with the header and footer trimmed off.

    The kept text starts on the line after the first "*** START OF" marker
    and ends just before the last "*** END OF" marker.

    Args:
        link: Despite its name, the book id; it is passed unchanged to
            ``bodyByBookId``.

    Returns:
        The text between the two markers, or an empty string when the book
        has no body. A missing marker leaves that side of the text untrimmed.
    """
    txt = bodyByBookId(link)
    if not txt:
        return ""

    marker = txt.find("*** START OF")
    if marker == -1:
        # No header marker: keep the text from the very beginning instead of
        # scanning from index -1, which silently dropped the first line.
        start = 0
    else:
        line_end = txt.find("\n", marker)
        # Content begins after the marker's own line; if that line is the
        # last one there is no content left.
        start = len(txt) if line_end == -1 else line_end + 1

    end = txt.rfind("*** END OF")
    if end == -1:
        # No footer marker: keep everything up to the end instead of
        # slicing to -1, which chopped off the final character.
        end = len(txt)
    return txt[start:end]
def bookHtm(link):
    """Return the text of every <p> element of an HTML document.

    Args:
        link: Location of the HTML document, handed to ``lxml.html.parse``.

    Returns:
        The text content of each paragraph, in the order the XPath query
        yields them, joined with newlines.
    """
    document = lxml.html.parse(link)
    paragraphs = document.xpath('//p')
    return "\n".join([paragraph.text_content() for paragraph in paragraphs])
def main():
    """Download every Spanish book and store each one as a UTF-8 text file.

    For each book id returned by ``booksIdByLanguage("es")`` the metadata is
    fetched, the text is taken from the HTML version when the metadata has a
    ``link_htm`` entry and from the plain-text body otherwise, and the result
    is written to ``corpus/<filename>.txt``. Books that yield no text are
    skipped with a warning. Progress and a final summary go to stdout.
    """
    args = loadArgParser()
    if args.verbose:
        logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG)

    lang = "es"
    corpus_dir = "corpus/"
    if not os.path.exists(corpus_dir):
        os.mkdir(corpus_dir)

    bookIds = booksIdByLanguage(lang)
    allBooks = len(bookIds)
    booksNotAdded = 0
    for processed, bId in enumerate(bookIds, 1):
        meta = metadataByBookId(bId)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("%d/%d: %s" % (processed, allBooks, meta["filename"]))

        if "link_htm" in meta:
            parsed = bookHtm(meta["link_htm"])
        else:
            parsed = bookTxt(bId)

        if not parsed:
            logging.warning("Book not added: no body content.")
            booksNotAdded += 1
            continue

        filePath = os.path.join(corpus_dir, meta["filename"] + ".txt")
        # codecs.open opens the underlying file in binary mode when an
        # encoding is given, so the mode must not carry a "t" flag
        # (Python 3 rejects the resulting "wtb" combination).
        with codecs.open(filePath, mode='w', encoding='utf-8') as f:
            f.write(parsed)

    # Report the books actually written, not the total that was processed.
    print("Books added to corpus: %d" % (allBooks - booksNotAdded))
    print("Books NOT added to corpus: %d" % booksNotAdded)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment