Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Scrape and feed
import urllib2, datetime
from lxml import etree
from yattag import Doc
# Address of the HTML page that is downloaded and scraped below.
url = "http://enricobergamini.it/scrape_feed_tutorial.html"
# This function transforms a date written as '03/24/2017' into the RSS-valid
# form 'Fri, 24 Mar 2017 00:00:00 +0200'.
def clean_date(x):
    """Return an RSS pubDate string for an element whose text is 'MM/DD/YYYY'.

    x is any object exposing the date as its ``text`` attribute (the script
    passes the elements found in the scraped page). The time part is always
    00:00:00 and the UTC offset is the fixed literal '+0200'.

    If the text is missing or is not a valid 'MM/DD/YYYY' date, the error is
    printed and today's date is returned in the same format, so the result
    is always a plain string.
    """
    # No %z directive here: it expands to an empty string for naive
    # datetimes, so the offset is written literally instead.
    # NOTE(review): %a and %b follow the process locale; RSS expects English
    # names, so this assumes the script runs under a C/English locale.
    rss_format = "%a, %d %b %Y %H:%M:%S +0200"
    try:
        return datetime.datetime.strptime(x.text, '%m/%d/%Y').strftime(rss_format)
    except (AttributeError, TypeError, ValueError) as e:
        # AttributeError: x has no .text; TypeError: .text is None;
        # ValueError: the text does not match the expected date layout.
        print(e)
        return datetime.date.today().strftime(rss_format)
# Load the page. The response is closed even if reading it fails.
response = urllib2.urlopen(url)
try:
    resp_data = response.read()
finally:
    response.close()
page = etree.HTML(resp_data)

# Find the HTML tags that hold the data; each findall() call returns a list
# of the matching elements.
titles_tags = page.findall('.//h1')
pubDate_tags = page.findall('.//p')
href_tags = page.findall('.//a')

# Turn the tags into plain data.
# I've built three slightly different and didactic examples of how to work on data:
# * extract the text for the title
# * clean the date to a correct RSS format with the function above, 'clean_date'
# * extract a link that is stored in a tag's attribute
list_of_titles = [title.text for title in titles_tags]
list_of_dates = [clean_date(date) for date in pubDate_tags]
list_of_hrefs = [link.get('href') for link in href_tags]

# Create a list of (title, date, href) tuples, one per entry. zip() stops at
# the shortest of the three lists. list() keeps the result a real list, so it
# can be printed here and iterated again later.
raw_datalist = list(zip(list_of_titles, list_of_dates, list_of_hrefs))
print(raw_datalist)
# It's really easy to write correct XML/HTML with Yattag.
# What you're doing here is basically writing an HTML-like document as output,
# using raw_datalist in the small loop inside to create one item per entry.
def generate_feed():
    """Render raw_datalist as an RSS 2.0 document and write it to 'your_feed.xml'.

    Reads the module-level raw_datalist; each row is used as
    (title, pubDate, link), and the title is reused as the item description.
    Takes no arguments and returns nothing; the only output is the file.
    """
    # ttl() also returns a `text` helper, which this feed does not need.
    doc, tag, _text, line = Doc().ttl()
    doc.asis('<?xml version="1.0" encoding="UTF-8"?>')
    with tag('rss',
             ('xmlns:atom', 'http://www.w3.org/2005/Atom'),
             ('version', '2.0')):
        with tag('channel'):
            line('title', 'Your Title - example')
            line('link', 'http://enricobergamini.it/feed')
            line('description', 'description of the feed')
            line('language', 'en')
            for row in raw_datalist:
                with tag('item'):
                    line('title', row[0])
                    line('pubDate', row[1])
                    line('link', row[2])
                    line('description', row[0])
    # Write only after the <rss> block has been closed, so the document is
    # complete. The mode is plain 'w': 'f' is not a valid mode flag and
    # Python 3's open() rejects 'wf'.
    with open('your_feed.xml', 'w') as f:
        f.write(doc.getvalue())
# Run the generator: builds the feed and writes it to 'your_feed.xml'.
generate_feed()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.