Skip to content

Instantly share code, notes, and snippets.

@firm1 firm1/puller.py Secret
Created Sep 23, 2017

Embed
What would you like to do?
import requests
from lxml import etree
import bs4 as BeautifulSoup
import csv
# Sitemap URLs for the three content types handled by this script.
URL_SITEMAP_OPINION = "https://beta.zestedesavoir.com/sitemap-opinions.xml"
URL_SITEMAP_TUTO = "https://beta.zestedesavoir.com/sitemap-tutos.xml"
URL_SITEMAP_ARTICLE = "https://beta.zestedesavoir.com/sitemap-articles.xml"

# Seconds to wait for the server. Without a timeout, requests waits forever
# on an unresponsive host and the script would hang at start-up.
SITEMAP_TIMEOUT = 30

# The three sitemaps are downloaded up front; each response is handed to
# process_contents() further down in the file.
r_opinion = requests.get(URL_SITEMAP_OPINION, timeout=SITEMAP_TIMEOUT)
r_tuto = requests.get(URL_SITEMAP_TUTO, timeout=SITEMAP_TIMEOUT)
r_article = requests.get(URL_SITEMAP_ARTICLE, timeout=SITEMAP_TIMEOUT)
def get_list_content(html_sitemap_content):
    """Return the URLs listed in a sitemap document.

    Args:
        html_sitemap_content: The sitemap XML as text (``str``).

    Returns:
        A list containing the stripped text of the ``<loc>`` child of every
        top-level entry, in document order. Entries that have no ``<loc>``
        child, or whose ``<loc>`` is empty, are skipped. An empty list is
        returned when the document cannot be parsed at all.
    """
    parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
    root = etree.fromstring(html_sitemap_content.encode('utf-8'), parser=parser)
    if root is None:
        # In recover mode lxml may give back no root element for input it
        # cannot salvage; treat that as "no URLs" instead of crashing.
        return []

    contents = []
    for entry in root:
        # Look the <loc> element up by local name (any namespace) rather than
        # by position, and never add elements to the parsed tree.
        loc = next(entry.iterchildren('{*}loc'), None)
        if loc is None or not loc.text:
            continue
        url = loc.text.strip()
        if url:
            contents.append(url)
    return contents
def process_url(url, timeout=30):
    """Download one content page and extract its metadata from the header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``license``, ``pubdate``, ``authors``,
        ``categories`` and ``title``. ``authors`` and ``categories`` are
        comma-joined strings.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        AttributeError: When an expected element is missing from the page
            (a lookup returned ``None``).
    """
    r = requests.get(url, timeout=timeout)
    # Fail early on 4xx/5xx instead of trying to parse an error page.
    r.raise_for_status()

    soup = BeautifulSoup.BeautifulSoup(r.text, "lxml")
    # All metadata is read from the <header> of the first <article> in <body>.
    metadata = soup.body.find('article').find('header')

    meta = {}
    meta['license'] = metadata.find('span', attrs={"itemprop": "license"}).text.strip()
    meta['pubdate'] = metadata.find('span', attrs={"class": "pubdate"}).find('time').attrs['datetime']

    authors = [
        mark.find('span', attrs={"itemprop": "name"}).text.strip()
        for mark in metadata.find_all('a', attrs={"itemprop": "author"})
    ]
    meta['authors'] = ','.join(authors)

    # The links are taken from the first <ul> that follows the second
    # "authors-label" span (presumably the categories label -- confirm
    # against the page markup).
    span_cat = metadata.find('span', attrs={"class": "authors-label"}).find_next('span', attrs={"class": "authors-label"})
    categories = [mark.text.strip() for mark in span_cat.find_next('ul').find_all('a')]
    meta['categories'] = ','.join(categories)

    # Only the last child node of the <h1> is used as the title.
    meta['title'] = metadata.find('h1', attrs={"itemprop": "name"}).contents[-1].strip()
    return meta
def process_contents(r_type, dest_file):
    """Write the metadata of every page listed in a sitemap to a CSV file.

    Args:
        r_type: Response object whose ``.text`` holds the sitemap XML.
        dest_file: Path of the CSV file to create (overwritten if present).

    The file is ``;``-delimited with a header row. A page that cannot be
    fetched or parsed is reported on stdout and skipped, so one bad page does
    not abort the whole export.
    """
    fieldnames = ['title', 'pubdate', 'authors', 'categories', 'license']
    # newline="" is what the csv module requires to avoid doubled line
    # endings on some platforms; UTF-8 keeps accented characters writable
    # regardless of the locale's default encoding.
    with open(dest_file, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=";")
        writer.writeheader()
        for url in get_list_content(r_type.text):
            try:
                meta = process_url(url)
                writer.writerow(meta)
            except Exception as exc:
                # Best effort: report and continue. Catching Exception (not a
                # bare except) lets Ctrl-C and SystemExit stop the run.
                print("ERROR on : " + str(url) + " (" + repr(exc) + ")")
# Export the metadata of each content type to its own CSV file.
print("Analyse des opinions")
process_contents(r_opinion, "meta_opinions.csv")
print("Analyse des tutoriels")
process_contents(r_tuto, "meta_tutos.csv")
print("Analyse des articles")
# Output name fixed from "meta_aticles.csv" to match the other two files.
process_contents(r_article, "meta_articles.csv")
print("Terminé")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.