@Aubreymcfato · Created July 14, 2016
Python script for the Italian Wikisource: it scrapes ns0 books, getting all the metadata, and also the cover URL from the Index page.
#!/usr/bin/env python
import requests
from bs4 import BeautifulSoup
import unicodecsv
# uses unicodecsv to generate the CSV; this script targets Python 2.7.
# On Python 3.x use the standard "csv" module instead and adapt the
# related calls: open the output file in text mode, e.g.
# csv.writer(open("FILE.csv", "w", newline="")), otherwise csv raises an error.
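# A minimal Python 3 equivalent of the writer setup below (an untested
# sketch; the rest of this script stays Python 2):
#   import csv
#   out = csv.writer(open("listametadati.csv", "w", newline="", encoding="utf-8"))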
# create output CSV file
out = unicodecsv.writer(open("listametadati.csv", "wb"), encoding="utf-8")
# write HEAD
head = ("TITLE", "TYPE", "URL", "CCE", "SUBJECT", "CREATOR", "CONTRIBUTOR", "DESCRIPTION", "PUBLISHER", "DATE", "FORMAT", "SOURCE", "LANGUAGE", "RELATION", "COVERAGE", "RIGHTS", "IMMAGINE", "IMMAGINE ED.")
out.writerow(head)
# helper that retrieves the cover URL from the Index ("Indice:") page
def get_cover(data_indice):
    page_cover = requests.get("http://it.wikisource.org/wiki/" + data_indice)
    soup_cover = BeautifulSoup(page_cover.text, "html.parser")
    # the cover is the first thumbnail image on the Index page
    cover_line = soup_cover.find("img", {"class": "thumbimage"})
    if cover_line is not None:
        cover_url = "http:" + cover_line["src"]
    else:
        cover_url = ""
    return cover_url
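# hypothetical usage (the page name is made up):
#   get_cover(u"Indice:Esempio.djvu")  # -> "http://upload.wikimedia.org/..." or ""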
# gets all the metadata for one book:
# it scrapes the microformat spans in the ns0 page with BeautifulSoup
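# the ns0 pages expose the metadata roughly like this (a sketch inferred from
# the selectors below, not a verbatim copy of the markup):
#   <span id="dati" data-argomento="..." data-urldellaversionecartaceaafronte="Indice:..."></span>
#   <span id="ws-title">...</span>, <span id="ws-author">...</span>, and so on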
def get_bookmetadata(book):
    # get the page HTML
    titolo = "http://it.wikisource.org/wiki/" + book[0]
    print titolo
    page = requests.get(titolo)
    # make it a 'soup'
    soup = BeautifulSoup(page.text, "html.parser")
    data = soup.find("span", {"id": "dati"})
    # find the topic
    data_argomento = data.get("data-argomento", "")
    # find the Index page name (None if the book has no scanned version)
    data_indice = data.get("data-urldellaversionecartaceaafronte")
    print data_indice
#find all the metadata
try:
titles = soup.find ('span', attrs={"id" : "ws-title"}) #se vede titoli troppo lunghi, prende quello della pagina in ns0 e non quello con l'a capo
except HTMLParser.HTMLParseError:
title = ""
try:
authors = soup.find ('span', attrs={"id" : "ws-author"})
except HTMLParser.HTMLParseError:
author = ""
try:
publishers = soup.find ('span', attrs={"id" : "ws-publisher"})
except HTMLParser.HTMLParseError:
publisher = ""
try:
dates = soup.find ('span', attrs={"id" : "ws-year"})
except HTMLParser.HTMLParseError:
date = ""
try:
places = soup.find ('span', attrs={"id" : "ws-place"})
except HTMLParser.HTMLParseError:
place = ""
    # this was probably an attempt to generate the Index cover URL:
    #   covers = soup.find('span', attrs={"data-urldellaversionecartaceaafronte"})
    #   cover_url = "http:" + get_cover()
    if titles is not None:
        title = titles.text
    if authors is not None:
        author = authors.text
    if publishers is not None:
        publisher = publishers.text
    if dates is not None:
        date = dates.text
    if places is not None:
        place = places.text
    if data_indice:
        # print "We do have the printed version somewhere!"
        cover_url = get_cover(data_indice)
    else:
        cover_url = ""
    out.writerow([
        unicode(title),
        u"E-book Open",
        u"http://wsexport.wmflabs.org/tool/book.php?lang=it&format=epub&page=" + unicode(book[0]),
        None,
        unicode(data_argomento),
        unicode(author),
        None,
        None,
        u"Wikisource, la biblioteca libera. <it.wikisource.org>",
        None,
        u"HTML | EPUB",
        unicode(date) + u" | " + unicode(publisher) + u" | " + unicode(place),
        u"Italiano",
        u"http://it.wikisource.org/wiki/" + unicode(book[0]),
        None,
        u"Pubblico dominio",
        unicode(cover_url),
        u"https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Wikisource-logo.svg/229px-Wikisource-logo.svg.png",
    ])
# read the books from an input CSV;
# any file layout will do, as long as the first column holds the ns0 title
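# hypothetical listalibri.csv (made-up titles, one ns0 title per row):
#   Titolo_del_primo_libro
#   Titolo_del_secondo_libro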
books = unicodecsv.reader(open("listalibri.csv", "rb"), encoding="utf-8")
for book in books:
    get_bookmetadata(book)
print "everything is ok"
"""
it is probably possible to write rewrite everything using Pywikibot and parsing the templates...
"""