@Aubreymcfato
Created July 14, 2016 14:34
Python script for the English Wikisource: it scrapes ns0 books, collecting all their metadata, plus additional metadata from the corresponding Index page.
import requests
from bs4 import BeautifulSoup
import unicodecsv
import re
# Uses unicodecsv to generate the CSV; written for Python 2.7.
# If you use Python 3.x, go with the standard "csv" module and change the related calls accordingly.
# Beware: the file mode belongs inside open(), i.e. csv.writer(open("FILE.csv", "wt")),
# not csv.writer(open("FILE.csv"), "wt"), otherwise it gives you an error.
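# A minimal Python 3 sketch of the writer setup below (an assumption, not part of
# the original script; it uses the standard library csv module):
#   import csv
#   out = csv.writer(open("en_metadata.csv", "w", newline="", encoding="utf-8"))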
# create output CSV file
out = unicodecsv.writer(open("en_metadata.csv", "wb"), encoding="utf-8")
head = ("TITLE", "TYPE", "URL", "CCE", "SUBJECT", "CREATOR", "CONTRIBUTOR", "DESCRIPTION", "PUBLISHER", "DATE", "FORMAT", "SOURCE", "LANGUAGE", "RELATION", "COVERAGE", "RIGHTS", "IMMAGINE", "IMMAGINE ED.")
out.writerow(head)
# Retrieves the extra metadata from the Index page.
# A bit different from itws_scraper.py: here I use regexes,
# and I also collect several other fields which are not in ns0.
def get_index_metadata(index):
    rr = requests.get("https://en.wikisource.org/wiki/" + index)
    # If it were the only image on the page, this would do: image = soup_cover.img['src']
    # Instead, grab the first DjVu page thumbnail with a regex; when there is no match,
    # re.search returns None and .group() raises AttributeError.
    try:
        image = re.search(r"//upload\.wikimedia\.org/(.*?)\.djvu\.jpg", rr.text)
        cover_url = "https:" + image.group(0)
    except AttributeError:
        cover_url = None
    data = BeautifulSoup(rr.text, "html.parser")
    # Each field sits in a <td> with a "ws-*" microformat id; find() returns None
    # when the field is missing, so .text raises AttributeError.
    try:
        publisher = data.find('td', attrs={"id": "ws-publisher"}).text
    except AttributeError:
        publisher = None
    try:
        place = data.find('td', attrs={"id": "ws-place"}).text
    except AttributeError:
        place = None
    try:
        date = data.find('td', attrs={"id": "ws-year"}).text
    except AttributeError:
        date = None
    return cover_url, publisher, place, date
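# Example usage (the Index title is hypothetical, shown only to illustrate the
# return shape; whether such a page exists is an assumption):
#   cover_url, publisher, place, date = get_index_metadata("Index:Example_book.djvu")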
# Gets all the metadata for a single book.
# It basically scrapes the "ws-*" microformat spans with BeautifulSoup.
def get_bookmetadata(book):
    book = book.strip()
    book_url = "https://en.wikisource.org/wiki/" + book
    r = requests.get(book_url)
    soup = BeautifulSoup(r.text, "html.parser")
    try:
        # I think there was a problem with overly long titles (they break).
        title = soup.find('span', attrs={"id": "ws-title"}).text
    except AttributeError:
        title = None
    try:
        author = soup.find('span', attrs={"id": "ws-author"}).text
    except AttributeError:
        author = None
    try:
        year = soup.find('span', attrs={"id": "ws-year"}).text
    except AttributeError:
        year = None
    # Look for a linked Index page. The non-greedy (.*?) keeps the match from
    # overshooting when ".djvu" appears more than once in the page source
    # (the original greedy .* could do that).
    i = re.search(r"Index:(.*?)\.djvu", r.text)
    # Not all the books have an Index page; handle both cases.
    if i is not None:
        index = i.group(0)
        cover_url, publisher, place, date = get_index_metadata(index)
    else:
        index = None
        cover_url = publisher = place = date = None
    # The script has to handle books both with and without an Index page,
    # hence the two alternative writerow calls below; the "without Source"
    # variant is kept commented out for reference.
    # Note: several output values are still Italian ("inglese" = English,
    # "Pubblico dominio" = public domain, "Wikisource, la biblioteca libera" =
    # "Wikisource, the free library").
    # Without Source:
    #out.writerow([unicode(title), "E-book Open", u"http://wsexport.wmflabs.org/tool/book.php?lang=en&format=epub&page=" + unicode(book), None, None, unicode(author), None, None, u"Wikisource, la biblioteca libera. <en.wikisource.org>", None, u"HTML | EPUB", unicode(date), u"inglese", u"http://en.wikisource.org/wiki/" + unicode(book), None, u"Pubblico dominio", cover_url, u"https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Wikisource-logo.svg/229px-Wikisource-logo.svg.png"])
    # With Source:
    out.writerow([unicode(title), "E-book Open", u"http://wsexport.wmflabs.org/tool/book.php?lang=en&format=epub&page=" + unicode(book), None, None, unicode(author), None, None, u"Wikisource, la biblioteca libera. <en.wikisource.org>", year, u"HTML | EPUB", unicode(date) + " | " + unicode(publisher) + " | " + unicode(place), u"inglese", u"http://en.wikisource.org/wiki/" + book, None, u"Pubblico dominio", cover_url, u"https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Wikisource-logo.svg/229px-Wikisource-logo.svg.png"])
# Read the books from an input list.
# Any file format (CSV, etc.) is fine; the important datum is the ns0 page title.
with open("en_list") as books:
    for book in books:
        get_bookmetadata(book)
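# Given the loop above, the expected "en_list" input is one ns0 page title per
# line; the titles below are hypothetical examples, not from the original script:
#   A_Study_in_Scarlet
#   Alice's_Adventures_in_Wonderland_(1866)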