@Aubreymcfato · Created July 14, 2016
Python script for the Italian Wikisource: it scrapes ns0 books, getting all the metadata, and also the cover URL from the Index page.
#!/usr/bin/env python
import requests
from bs4 import BeautifulSoup
import unicodecsv
# uses unicodecsv to generate the CSV; this script targets Python 2.7.
# On Python 3.x use the standard "csv" module instead and adapt the
# related calls: open the output file in text mode, e.g.
# csv.writer(open("FILE.csv", "w", newline="")), otherwise csv raises an error.
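# A minimal Python 3 equivalent of the writer setup below (an untested
# sketch; the rest of this script stays Python 2):
#   import csv
#   out = csv.writer(open("listametadati.csv", "w", newline="", encoding="utf-8"))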
# create output CSV file
out = unicodecsv.writer(open("listametadati.csv", "wb"), encoding="utf-8")
# write HEAD
head = ("TITLE", "TYPE", "URL", "CCE", "SUBJECT", "CREATOR", "CONTRIBUTOR", "DESCRIPTION", "PUBLISHER", "DATE", "FORMAT", "SOURCE", "LANGUAGE", "RELATION", "COVERAGE", "RIGHTS", "IMMAGINE", "IMMAGINE ED.")
out.writerow(head)
# helper that retrieves the cover URL from the Index ("Indice:") page
def get_cover(data_indice):
    page_cover = requests.get("http://it.wikisource.org/wiki/" + data_indice)
    soup_cover = BeautifulSoup(page_cover.text, "html.parser")
    # the cover is the first thumbnail image on the Index page
    cover_line = soup_cover.find("img", {"class": "thumbimage"})
    if cover_line is not None:
        cover_url = "http:" + cover_line["src"]
    else:
        cover_url = ""
    return cover_url
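# hypothetical usage (the page name is made up):
#   get_cover(u"Indice:Esempio.djvu")  # -> "http://upload.wikimedia.org/..." or ""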
# gets all the metadata for one book:
# it scrapes the microformat spans in the ns0 page with BeautifulSoup
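# the ns0 pages expose the metadata roughly like this (a sketch inferred from
# the selectors below, not a verbatim copy of the markup):
#   <span id="dati" data-argomento="..." data-urldellaversionecartaceaafronte="Indice:..."></span>
#   <span id="ws-title">...</span>, <span id="ws-author">...</span>, and so on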
def get_bookmetadata(book):
    # get the page HTML
    titolo = "http://it.wikisource.org/wiki/" + book[0]
    print titolo
    page = requests.get(titolo)
    # make it a 'soup'
    soup = BeautifulSoup(page.text, "html.parser")
    data = soup.find("span", {"id": "dati"})
    # find the topic
    data_argomento = data.get("data-argomento", "")
    # find the Index page name (None if the book has no scanned version)
    data_indice = data.get("data-urldellaversionecartaceaafronte")
    print data_indice
#find all the metadata
try:
titles = soup.find ('span', attrs={"id" : "ws-title"}) #se vede titoli troppo lunghi, prende quello della pagina in ns0 e non quello con l'a capo
except HTMLParser.HTMLParseError:
title = ""
try:
authors = soup.find ('span', attrs={"id" : "ws-author"})
except HTMLParser.HTMLParseError:
author = ""
try:
publishers = soup.find ('span', attrs={"id" : "ws-publisher"})
except HTMLParser.HTMLParseError:
publisher = ""
try:
dates = soup.find ('span', attrs={"id" : "ws-year"})
except HTMLParser.HTMLParseError:
date = ""
try:
places = soup.find ('span', attrs={"id" : "ws-place"})
except HTMLParser.HTMLParseError:
place = ""
    # this was probably an attempt to generate the Index cover URL:
    #   covers = soup.find('span', attrs={"data-urldellaversionecartaceaafronte"})
    #   cover_url = "http:" + get_cover()
    if titles is not None:
        title = titles.text
    if authors is not None:
        author = authors.text
    if publishers is not None:
        publisher = publishers.text
    if dates is not None:
        date = dates.text
    if places is not None:
        place = places.text
    if data_indice:
        # print "We do have the printed version somewhere!"
        cover_url = get_cover(data_indice)
    else:
        cover_url = ""
    out.writerow([
        unicode(title),
        u"E-book Open",
        u"http://wsexport.wmflabs.org/tool/book.php?lang=it&format=epub&page=" + unicode(book[0]),
        None,
        unicode(data_argomento),
        unicode(author),
        None,
        None,
        u"Wikisource, la biblioteca libera. <it.wikisource.org>",
        None,
        u"HTML | EPUB",
        unicode(date) + u" | " + unicode(publisher) + u" | " + unicode(place),
        u"Italiano",
        u"http://it.wikisource.org/wiki/" + unicode(book[0]),
        None,
        u"Pubblico dominio",
        unicode(cover_url),
        u"https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Wikisource-logo.svg/229px-Wikisource-logo.svg.png",
    ])
# read the books from an input CSV;
# any file layout will do, as long as the first column holds the ns0 title
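# hypothetical listalibri.csv (made-up titles, one ns0 title per row):
#   Titolo_del_primo_libro
#   Titolo_del_secondo_libro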
books = unicodecsv.reader(open("listalibri.csv", "rb"), encoding="utf-8")
for book in books:
    get_bookmetadata(book)
print "everything is ok"
"""
it is probably possible to write rewrite everything using Pywikibot and parsing the templates...
"""