@Aubreymcfato
Created July 14, 2016 14:34
Python script for the English Wikisource: it scrapes ns0 books, collecting all their metadata, plus additional metadata from the corresponding Index page.
import requests
from bs4 import BeautifulSoup
import unicodecsv
import re
# Uses unicodecsv to generate the CSV; written for Python 2.7.
# If you use Python 3.x, go with the standard "csv" module and change the related calls accordingly.
# Beware: the file mode belongs inside open(), i.e. csv.writer(open("FILE.csv", "wt")),
# not csv.writer(open("FILE.csv"), "wt"), otherwise it gives you an error.
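# A minimal Python 3 sketch of the writer setup below (an assumption, not part of
# the original script; it uses the standard library csv module):
#   import csv
#   out = csv.writer(open("en_metadata.csv", "w", newline="", encoding="utf-8"))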
# create output CSV file
out = unicodecsv.writer(open("en_metadata.csv", "wb"), encoding="utf-8")
head = ("TITLE", "TYPE", "URL", "CCE", "SUBJECT", "CREATOR", "CONTRIBUTOR", "DESCRIPTION", "PUBLISHER", "DATE", "FORMAT", "SOURCE", "LANGUAGE", "RELATION", "COVERAGE", "RIGHTS", "IMMAGINE", "IMMAGINE ED.")
out.writerow(head)
# Retrieves the extra metadata from the Index page.
# A bit different from itws_scraper.py: here I use regexes,
# and I also collect several other fields which are not in ns0.
def get_index_metadata(index):
    rr = requests.get("https://en.wikisource.org/wiki/" + index)
    # If it were the only image on the page, this would do: image = soup_cover.img['src']
    # Instead, grab the first DjVu page thumbnail with a regex; when there is no match,
    # re.search returns None and .group() raises AttributeError.
    try:
        image = re.search(r"//upload\.wikimedia\.org/(.*?)\.djvu\.jpg", rr.text)
        cover_url = "https:" + image.group(0)
    except AttributeError:
        cover_url = None
    data = BeautifulSoup(rr.text, "html.parser")
    # Each field sits in a <td> with a "ws-*" microformat id; find() returns None
    # when the field is missing, so .text raises AttributeError.
    try:
        publisher = data.find('td', attrs={"id": "ws-publisher"}).text
    except AttributeError:
        publisher = None
    try:
        place = data.find('td', attrs={"id": "ws-place"}).text
    except AttributeError:
        place = None
    try:
        date = data.find('td', attrs={"id": "ws-year"}).text
    except AttributeError:
        date = None
    return cover_url, publisher, place, date
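# Example usage (the Index title is hypothetical, shown only to illustrate the
# return shape; whether such a page exists is an assumption):
#   cover_url, publisher, place, date = get_index_metadata("Index:Example_book.djvu")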
# Gets all the metadata for a single book.
# It basically scrapes the "ws-*" microformat spans with BeautifulSoup.
def get_bookmetadata(book):
    book = book.strip()
    book_url = "https://en.wikisource.org/wiki/" + book
    r = requests.get(book_url)
    soup = BeautifulSoup(r.text, "html.parser")
    try:
        # I think there was a problem with overly long titles (they break).
        title = soup.find('span', attrs={"id": "ws-title"}).text
    except AttributeError:
        title = None
    try:
        author = soup.find('span', attrs={"id": "ws-author"}).text
    except AttributeError:
        author = None
    try:
        year = soup.find('span', attrs={"id": "ws-year"}).text
    except AttributeError:
        year = None
    # Look for a linked Index page. The non-greedy (.*?) keeps the match from
    # overshooting when ".djvu" appears more than once in the page source
    # (the original greedy .* could do that).
    i = re.search(r"Index:(.*?)\.djvu", r.text)
    # Not all the books have an Index page; handle both cases.
    if i is not None:
        index = i.group(0)
        cover_url, publisher, place, date = get_index_metadata(index)
    else:
        index = None
        cover_url = publisher = place = date = None
    # The script has to handle books both with and without an Index page,
    # hence the two alternative writerow calls below; the "without Source"
    # variant is kept commented out for reference.
    # Note: several output values are still Italian ("inglese" = English,
    # "Pubblico dominio" = public domain, "Wikisource, la biblioteca libera" =
    # "Wikisource, the free library").
    # Without Source:
    #out.writerow([unicode(title), "E-book Open", u"http://wsexport.wmflabs.org/tool/book.php?lang=en&format=epub&page=" + unicode(book), None, None, unicode(author), None, None, u"Wikisource, la biblioteca libera. <en.wikisource.org>", None, u"HTML | EPUB", unicode(date), u"inglese", u"http://en.wikisource.org/wiki/" + unicode(book), None, u"Pubblico dominio", cover_url, u"https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Wikisource-logo.svg/229px-Wikisource-logo.svg.png"])
    # With Source:
    out.writerow([unicode(title), "E-book Open", u"http://wsexport.wmflabs.org/tool/book.php?lang=en&format=epub&page=" + unicode(book), None, None, unicode(author), None, None, u"Wikisource, la biblioteca libera. <en.wikisource.org>", year, u"HTML | EPUB", unicode(date) + " | " + unicode(publisher) + " | " + unicode(place), u"inglese", u"http://en.wikisource.org/wiki/" + book, None, u"Pubblico dominio", cover_url, u"https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Wikisource-logo.svg/229px-Wikisource-logo.svg.png"])
# Read the books from an input list.
# Any file format (CSV, etc.) is fine; the important datum is the ns0 page title.
with open("en_list") as books:
    for book in books:
        get_bookmetadata(book)
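# Given the loop above, the expected "en_list" input is one ns0 page title per
# line; the titles below are hypothetical examples, not from the original script:
#   A_Study_in_Scarlet
#   Alice's_Adventures_in_Wonderland_(1866)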