@brunifrancesco
Created June 13, 2016 07:57
# -*- coding: utf-8 -*-
"""
Get Paris attractions data scraping the BASE_URL site.
Wrap those data into a json structure.
Deps:
- BeautifulSoup
- requests
- fn
"""
from bs4 import BeautifulSoup
import json
import requests
from fn.monad import Option
import unittest

BASE_URL = "http://www.parigi.it/it/cosa_vedere_a_parigi.php"

class ScrapedData:
    """
    Dummy container to store information retrieved from scraped content
    """

    def __init__(self, img, subtitle, text, other_infos, correlated):
        self.img = img
        self.subtitle = subtitle
        self.text = text
        self.other_infos = other_infos
        self.correlated = correlated

    def to_dict(self):
        return self.__dict__

    def __str__(self):
        return self.subtitle
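# A quick usage sketch of the container (hypothetical values, shown only to
# illustrate the constructor and its to_dict() output):
#
#     item = ScrapedData(img="http://example.org/louvre.jpg",
#                        subtitle="<h3>Louvre</h3>", text="<p>...</p>",
#                        other_infos="", correlated=["Tour Eiffel"])
#     item.to_dict()
#     # {'img': 'http://example.org/louvre.jpg',
#     #  'subtitle': '<h3>Louvre</h3>', 'text': '<p>...</p>',
#     #  'other_infos': '', 'correlated': ['Tour Eiffel']}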
def do_basic_search():
    """
    Scrape the main page to get attraction page urls;
    process each retrieved url.
    """
    print("Scraping main page")
    soup = (Option(requests.get(BASE_URL))
            .map(lambda result: result.text)
            .map(lambda html: BeautifulSoup(html, "html.parser"))
            # fall back to an empty soup so findAll safely returns []
            .get_or(BeautifulSoup("", "html.parser")))
    boxes = soup.findAll("div", {"class": "paragrafo_correlazioni_box"})
    # scrape_content returns None on failure, so drop failed pages
    return [data for data in (scrape_content(url) for url in boxes) if data]
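# The Option chain above is roughly equivalent to this imperative sketch
# (a simplification for illustration, not part of the original gist):
#
#     response = requests.get(BASE_URL)
#     if response is not None:
#         soup = BeautifulSoup(response.text, "html.parser")
#     else:
#         soup = BeautifulSoup("", "html.parser")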
def scrape_content(element):
    """
    Scrape the single page getting relevant information
    and wrap it in a ScrapedData instance.
    """
    print("Processing %s" % element.find("a").attrs["title"])
    try:
        data = requests.get(element.find("a").attrs["href"]).text
        soup = BeautifulSoup(data, "html.parser").find("div", {"class": "centro"})
        return ScrapedData(
            img=Option(soup.find("div", {"class": "view"}))
                .map(lambda div: div.find("a"))
                .map(lambda link: link.attrs["href"])
                .get_or("Image not available"),
            subtitle=Option(soup.find("div", {"class": "paragrafo_1_testo"}))
                .map(lambda div: div.find("h3"))
                .map(lambda result: str(result))
                .get_or("Content not available"),
            text=Option(soup.find("div", {"class": "paragrafo_1_testo"}))
                .map(lambda item: item.find("p"))
                .map(lambda result: str(result))
                .get_or("Content not available"),
            other_infos="<br />".join(
                str(item) for item in soup.findAll("table", {"class": "tabella_contenuti"})),
            # materialize to a list (not a Python 3 map object) so the
            # result stays JSON serializable
            correlated=[el.find("a").attrs.get("title", "")
                        for el in soup.findAll("div", {"class": "paragrafo_correlazioni_box"})]
        )
    except Exception as e:
        print(e)
        print("error in processing %s" % element.find("a").attrs["href"])
        return None
class TestScraping(unittest.TestCase):

    def test_scraping(self):
        result = do_basic_search()
        assert result
        self.assertTrue(len(result) > 2)
        print(
            Option([item.to_dict() for item in result])
            .map(lambda lst: dict(result=lst))
            .map(lambda result: json.dumps(result))
            .get_or(json.dumps(dict(result="No data available"))))
if __name__ == '__main__':
    unittest.main()
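# Running this module executes the test, which prints the scraped data as
# JSON. Based on the ScrapedData fields, the output should look roughly like
# the following (field values are placeholders, not real scraped data):
#
#     {"result": [{"img": "...", "subtitle": "...", "text": "...",
#                  "other_infos": "...", "correlated": ["...", "..."]}]}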