firm1/puller.py Secret

## puller.py
import requests
from lxml import etree
import bs4 as BeautifulSoup
import csv

# Sitemap URLs for the three content kinds handled by this script.
URL_SITEMAP_OPINION = "https://beta.zestedesavoir.com/sitemap-opinions.xml"
URL_SITEMAP_TUTO = "https://beta.zestedesavoir.com/sitemap-tutos.xml"
URL_SITEMAP_ARTICLE = "https://beta.zestedesavoir.com/sitemap-articles.xml"

# Fetch the three sitemaps up front; the responses are handed to
# process_contents() at the bottom of the script. requests applies no timeout
# by default, so set one explicitly to avoid hanging on a stalled connection.
r_opinion = requests.get(URL_SITEMAP_OPINION, timeout=30)
r_tuto = requests.get(URL_SITEMAP_TUTO, timeout=30)
r_article = requests.get(URL_SITEMAP_ARTICLE, timeout=30)

def get_list_content(html_sitemap_content):
  """Extract the page URLs listed in a sitemap document.

  Args:
    html_sitemap_content: Sitemap XML as ``str`` (e.g. ``response.text``) or
      as UTF-8 encoded ``bytes``.

  Returns:
    A list holding, for every entry directly under the document root, the
    stripped text of its first ``<loc>`` child, in document order. Entries
    without a non-empty ``<loc>`` are skipped. The list is empty when the
    parser could not recover any root element.
  """
  if isinstance(html_sitemap_content, str):
    # lxml rejects str input that carries an XML encoding declaration, so
    # always hand it bytes.
    html_sitemap_content = html_sitemap_content.encode('utf-8')

  parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
  root = etree.fromstring(html_sitemap_content, parser=parser)
  if root is None:
    # A recovering parser can give back None for hopeless input.
    return []

  contents = []
  for entry in root:
    for node in entry:
      tag = node.tag
      # Comments/processing instructions have a non-string tag. Sitemap
      # elements are namespaced ("{ns}loc"), so compare the local name only.
      if isinstance(tag, str) and tag.rsplit('}', 1)[-1] == 'loc':
        location = (node.text or '').strip()
        if location:
          contents.append(location)
        break
  return contents

def process_url(url, timeout=30):
  """Download a content page and extract the metadata shown in its header.

  Args:
    url: Address of the page to scrape.
    timeout: Seconds passed to ``requests.get`` as its timeout, so a stalled
      server cannot block the script forever.

  Returns:
    A dict with the keys 'license', 'pubdate', 'authors', 'categories' and
    'title'. Authors and categories are comma-joined strings.

  Raises:
    requests.RequestException: On network failure, timeout or an HTTP error
      status.
    AttributeError, KeyError, IndexError, TypeError: When the page does not
      contain the header markup the lookups below expect.
  """
  r = requests.get(url, timeout=timeout)
  # Fail early on 4xx/5xx instead of trying to parse an error page.
  r.raise_for_status()
  soup = BeautifulSoup.BeautifulSoup(r.text, "lxml")
  metadata = soup.body.find('article').find('header')
  meta = {}

  meta['license'] = metadata.find('span', attrs={"itemprop": "license"}).text.strip()
  meta['pubdate'] = metadata.find('span', attrs={"class": "pubdate"}).find('time').attrs['datetime']

  authors = [
    mark.find('span', attrs={"itemprop": "name"}).text.strip()
    for mark in metadata.find_all('a', attrs={"itemprop": "author"})
  ]
  meta['authors'] = ','.join(authors)

  # The category links are taken from the first <ul> following the second
  # "authors-label" span of the header.
  span_cat = metadata.find('span', attrs={"class": "authors-label"}).find_next('span', attrs={"class": "authors-label"})
  categories = [mark.text.strip() for mark in span_cat.find_next('ul').find_all('a')]
  meta['categories'] = ','.join(categories)

  # Only the last child node of the <h1> is used as the title.
  meta['title'] = metadata.find('h1', attrs={"itemprop": "name"}).contents[-1].strip()

  return meta

def process_contents(r_type, dest_file):
  """Scrape every page listed in a sitemap and write its metadata as CSV.

  Args:
    r_type: Object whose ``text`` attribute holds the sitemap XML (the
      script passes ``requests`` responses).
    dest_file: Path of the CSV file to create or overwrite. The file is
      ';'-delimited and starts with a header row.

  A page that cannot be fetched, parsed or written is reported on stdout
  together with the cause, and processing continues with the next URL.
  """
  fieldnames = ['title', 'pubdate', 'authors', 'categories', 'license']
  # newline='' is what the csv module asks for (otherwise rows can gain
  # extra blank lines on some platforms); explicit UTF-8 keeps non-ASCII
  # titles from depending on the locale's default encoding.
  with open(dest_file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=";")
    writer.writeheader()

    for url in get_list_content(r_type.text):
      try:
        meta = process_url(url)
        writer.writerow(meta)
      except Exception as exc:
        # Best effort per page, but let KeyboardInterrupt/SystemExit through
        # and show why the page failed.
        print("ERROR on : {} ({!r})".format(url, exc))

# Scrape each content kind into its own CSV file, announcing progress on stdout.
print("Analyse des opinions")
process_contents(r_opinion, "meta_opinions.csv")
print("Analyse des tutoriels")
process_contents(r_tuto, "meta_tutos.csv")
print("Analyse des articles")
# Output name corrected; it was misspelled "meta_aticles.csv".
process_contents(r_article, "meta_articles.csv")
print("Terminé")
	import requests
	from lxml import etree
	import bs4 as BeautifulSoup
	import csv

	# Sitemap URLs for the three content kinds handled by this script.
	URL_SITEMAP_OPINION = "https://beta.zestedesavoir.com/sitemap-opinions.xml"
	URL_SITEMAP_TUTO = "https://beta.zestedesavoir.com/sitemap-tutos.xml"
	URL_SITEMAP_ARTICLE = "https://beta.zestedesavoir.com/sitemap-articles.xml"

	# Fetch the three sitemaps up front; the responses are handed to
	# process_contents() at the bottom of the script. requests applies no timeout
	# by default, so set one explicitly to avoid hanging on a stalled connection.
	r_opinion = requests.get(URL_SITEMAP_OPINION, timeout=30)
	r_tuto = requests.get(URL_SITEMAP_TUTO, timeout=30)
	r_article = requests.get(URL_SITEMAP_ARTICLE, timeout=30)

	def get_list_content(html_sitemap_content):
	  """Extract the page URLs listed in a sitemap document.

	  Args:
	    html_sitemap_content: Sitemap XML as ``str`` (e.g. ``response.text``) or
	      as UTF-8 encoded ``bytes``.

	  Returns:
	    A list holding, for every entry directly under the document root, the
	    stripped text of its first ``<loc>`` child, in document order. Entries
	    without a non-empty ``<loc>`` are skipped. The list is empty when the
	    parser could not recover any root element.
	  """
	  if isinstance(html_sitemap_content, str):
	    # lxml rejects str input that carries an XML encoding declaration, so
	    # always hand it bytes.
	    html_sitemap_content = html_sitemap_content.encode('utf-8')

	  parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
	  root = etree.fromstring(html_sitemap_content, parser=parser)
	  if root is None:
	    # A recovering parser can give back None for hopeless input.
	    return []

	  contents = []
	  for entry in root:
	    for node in entry:
	      tag = node.tag
	      # Comments/processing instructions have a non-string tag. Sitemap
	      # elements are namespaced ("{ns}loc"), so compare the local name only.
	      if isinstance(tag, str) and tag.rsplit('}', 1)[-1] == 'loc':
	        location = (node.text or '').strip()
	        if location:
	          contents.append(location)
	        break
	  return contents

	def process_url(url, timeout=30):
	  """Download a content page and extract the metadata shown in its header.

	  Args:
	    url: Address of the page to scrape.
	    timeout: Seconds passed to ``requests.get`` as its timeout, so a stalled
	      server cannot block the script forever.

	  Returns:
	    A dict with the keys 'license', 'pubdate', 'authors', 'categories' and
	    'title'. Authors and categories are comma-joined strings.

	  Raises:
	    requests.RequestException: On network failure, timeout or an HTTP error
	      status.
	    AttributeError, KeyError, IndexError, TypeError: When the page does not
	      contain the header markup the lookups below expect.
	  """
	  r = requests.get(url, timeout=timeout)
	  # Fail early on 4xx/5xx instead of trying to parse an error page.
	  r.raise_for_status()
	  soup = BeautifulSoup.BeautifulSoup(r.text, "lxml")
	  metadata = soup.body.find('article').find('header')
	  meta = {}

	  meta['license'] = metadata.find('span', attrs={"itemprop": "license"}).text.strip()
	  meta['pubdate'] = metadata.find('span', attrs={"class": "pubdate"}).find('time').attrs['datetime']

	  authors = [
	    mark.find('span', attrs={"itemprop": "name"}).text.strip()
	    for mark in metadata.find_all('a', attrs={"itemprop": "author"})
	  ]
	  meta['authors'] = ','.join(authors)

	  # The category links are taken from the first <ul> following the second
	  # "authors-label" span of the header.
	  span_cat = metadata.find('span', attrs={"class": "authors-label"}).find_next('span', attrs={"class": "authors-label"})
	  categories = [mark.text.strip() for mark in span_cat.find_next('ul').find_all('a')]
	  meta['categories'] = ','.join(categories)

	  # Only the last child node of the <h1> is used as the title.
	  meta['title'] = metadata.find('h1', attrs={"itemprop": "name"}).contents[-1].strip()

	  return meta

	def process_contents(r_type, dest_file):
	  """Scrape every page listed in a sitemap and write its metadata as CSV.

	  Args:
	    r_type: Object whose ``text`` attribute holds the sitemap XML (the
	      script passes ``requests`` responses).
	    dest_file: Path of the CSV file to create or overwrite. The file is
	      ';'-delimited and starts with a header row.

	  A page that cannot be fetched, parsed or written is reported on stdout
	  together with the cause, and processing continues with the next URL.
	  """
	  fieldnames = ['title', 'pubdate', 'authors', 'categories', 'license']
	  # newline='' is what the csv module asks for (otherwise rows can gain
	  # extra blank lines on some platforms); explicit UTF-8 keeps non-ASCII
	  # titles from depending on the locale's default encoding.
	  with open(dest_file, "w", newline="", encoding="utf-8") as csvfile:
	    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=";")
	    writer.writeheader()

	    for url in get_list_content(r_type.text):
	      try:
	        meta = process_url(url)
	        writer.writerow(meta)
	      except Exception as exc:
	        # Best effort per page, but let KeyboardInterrupt/SystemExit through
	        # and show why the page failed.
	        print("ERROR on : {} ({!r})".format(url, exc))

	# Scrape each content kind into its own CSV file, announcing progress on stdout.
	print("Analyse des opinions")
	process_contents(r_opinion, "meta_opinions.csv")
	print("Analyse des tutoriels")
	process_contents(r_tuto, "meta_tutos.csv")
	print("Analyse des articles")
	# Output name corrected; it was misspelled "meta_aticles.csv".
	process_contents(r_article, "meta_articles.csv")
	print("Terminé")