Skip to content

Instantly share code, notes, and snippets.

@ebergam
Created December 19, 2017 12:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ebergam/3eb063a5d53113906541d0b7d20636cf to your computer and use it in GitHub Desktop.
Save ebergam/3eb063a5d53113906541d0b7d20636cf to your computer and use it in GitHub Desktop.
Scrape and feed
import urllib2, datetime
from lxml import etree
from yattag import Doc
url = "http://enricobergamini.it/scrape_feed_tutorial.html"
#this function transforms a date written as '03/24/2017' into an RSS-valid 'Fri, 24 Mar 2017 00:00:00 +0200'
def clean_date(x):
    """Convert an element whose .text holds 'MM/DD/YYYY' into an RFC-822 pubDate string.

    Falls back to today's date when the text is missing or malformed, so the
    feed always gets a well-formed pubDate.
    """
    try:
        d = datetime.datetime.strptime(x.text, '%m/%d/%Y')
    except (AttributeError, TypeError, ValueError) as e:
        # x.text may be None/absent (TypeError/AttributeError) or simply not a
        # date (ValueError); log the problem and fall back to today
        print(e)
        d = datetime.datetime.today()
    # The parsed datetime is naive, so '%z' would expand to '' — append the
    # fixed offset literally instead (the original format produced a double
    # space here, and the fallback path returned a 1-tuple due to a stray comma).
    return d.strftime('%a, %d %b %Y %H:%M:%S') + ' +0200'
#load the page and parse it into an lxml element tree.
#try/finally guarantees the connection is closed even if read() or the
#HTML parse raises (the original leaked the socket on any error).
response = urllib2.urlopen(url)
try:
    resp_data = response.read()
    page = etree.HTML(resp_data)
finally:
    response.close()
#find the HTML tags carrying the data; each findall returns a list of elements
titles_tags = page.findall('.//h1')
pubDate_tags = page.findall('.//p')
href_tags = page.findall('.//a')
# Build the three columns of the feed, one comprehension per column:
# * extract the text for the title
# * normalise each date to a valid RSS format with clean_date() above
# * extract the link from the tag's 'href' attribute
list_of_titles = [title.text for title in titles_tags]
list_of_dates = [clean_date(date) for date in pubDate_tags]
list_of_hrefs = [link.get('href') for link in href_tags]
#pair every title with its date and link into (title, date, href) rows.
#list() is essential: on Python 3 zip is a one-shot iterator, so printing it
#here would exhaust it before generate_feed() below iterates over it.
raw_datalist = list(zip(list_of_titles, list_of_dates, list_of_hrefs))
print(raw_datalist)
#It's really easy to write correct XML/HTML with Yattag:
#we emit an RSS 2.0 document, with one <item> per row of the scraped data.
def generate_feed(datalist=None, out_path='your_feed.xml'):
    """Write an RSS 2.0 feed file from (title, pubDate, href) rows.

    datalist: iterable of 3-tuples; defaults to the module-level raw_datalist
              so existing no-argument callers keep working.
    out_path: destination file for the generated XML.
    """
    if datalist is None:
        datalist = raw_datalist
    doc, tag, text, line = Doc().ttl()
    doc.asis('<?xml version="1.0" encoding="UTF-8"?>')
    with tag('rss',
             ('xmlns:atom', 'http://www.w3.org/2005/Atom'),
             ('version', '2.0')
             ):
        with tag('channel'):
            line('title', 'Your Title - example')
            line('link', 'http://enricobergamini.it/feed')
            line('description', 'description of the feed')
            line('language', 'en')
            for row in datalist:
                with tag('item'):
                    line('title', row[0])
                    line('pubDate', row[1])
                    line('link', row[2])
                    # no separate summary scraped, so reuse the title
                    line('description', row[0])
    # 'wf' was an invalid file mode ('w' is write; there is no 'f' flag —
    # Python 3 raises ValueError on it); 'with' also closes the file reliably.
    with open(out_path, 'w') as f:
        f.write(doc.getvalue())
generate_feed()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment