Created
December 19, 2017 12:58
-
-
Save ebergam/3eb063a5d53113906541d0b7d20636cf to your computer and use it in GitHub Desktop.
Scrape and feed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib2, datetime | |
from lxml import etree | |
from yattag import Doc | |
# Tutorial page to scrape; the XPath queries below match its markup.
url = "http://enricobergamini.it/scrape_feed_tutorial.html"
# Transforms a date written as '03/24/2017' into an RSS-valid
# 'Fri, 24 Mar 2017 00:00:00 +0200'.
def clean_date(x):
    """Return an RSS 2.0 pubDate string for a scraped date element.

    Accepts either an lxml element whose ``.text`` is 'MM/DD/YYYY' or a
    plain 'MM/DD/YYYY' string (backward-compatible generalization).
    Falls back to today's date when the value cannot be parsed.
    """
    raw = getattr(x, 'text', x)
    try:
        d = datetime.datetime.strptime(raw, '%m/%d/%Y')
    except (TypeError, ValueError) as e:
        print(e)
        # Original fallback had a trailing comma, which returned a
        # 1-tuple instead of a string and broke the feed output.
        d = datetime.datetime.today()
    # %z is empty on a naive datetime, so the fixed offset is appended
    # literally (the original '%z +0200' left a stray double space).
    return d.strftime('%a, %d %b %Y %H:%M:%S +0200')
# Download the page and parse the HTML into an element tree.
response = urllib2.urlopen(url)
try:
    # read()/parse inside try/finally so the connection is closed even
    # when one of them raises (the original leaked the handle on error).
    page = etree.HTML(response.read())
finally:
    response.close()

# Accumulators filled by the extraction loops below.
list_of_titles = []
list_of_dates = []
list_of_hrefs = []

# Each findall returns a (possibly empty) list of matching elements.
titles_tags = page.findall('.//h1')
pubDate_tags = page.findall('.//p')
href_tags = page.findall('.//a')
# Turn each list of lxml elements into plain values, three ways:
#   * titles: take the element text
#   * dates:  normalise to RSS format via clean_date()
#   * links:  read the 'href' attribute
list_of_titles = [title.text for title in titles_tags]
list_of_dates = [clean_date(date) for date in pubDate_tags]
list_of_hrefs = [link.get('href') for link in href_tags]

# One (title, pubDate, link) tuple per feed entry.  Wrapping zip() in
# list() keeps the result printable and re-iterable on Python 3, where
# zip returns a one-shot iterator (on Python 2 it is already a list).
raw_datalist = list(zip(list_of_titles, list_of_dates, list_of_hrefs))
print(raw_datalist)
# It's really easy to write correct XML/HTML with Yattag: we emit an RSS
# document, looping over raw_datalist to create one <item> per entry.
def generate_feed():
    """Write the module-level raw_datalist as an RSS 2.0 feed.

    Builds the XML with yattag and saves it to 'your_feed.xml' in the
    current directory.  Each (title, pubDate, link) tuple becomes one
    <item>; the title doubles as the description since no separate
    summary was scraped.
    """
    doc, tag, text, line = Doc().ttl()
    doc.asis('<?xml version="1.0" encoding="UTF-8"?>')
    with tag('rss',
        ('xmlns:atom', 'http://www.w3.org/2005/Atom'),
        ('version', '2.0')
    ):
        with tag('channel'):
            # Required channel metadata per the RSS 2.0 spec.
            line('title', 'Your Title - example')
            line('link', 'http://enricobergamini.it/feed')
            line('description', 'description of the feed')
            line('language', 'en')
            for item_title, item_date, item_href in raw_datalist:
                with tag('item'):
                    line('title', item_title)
                    line('pubDate', item_date)
                    line('link', item_href)
                    line('description', item_title)
    # BUG FIX: the original opened the file with mode 'wf', which is
    # invalid and raises ValueError before anything is written.
    with open('your_feed.xml', 'w') as f:
        f.write(doc.getvalue())
generate_feed()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment