Created
December 19, 2017 12:58
-
-
Save ebergam/3eb063a5d53113906541d0b7d20636cf to your computer and use it in GitHub Desktop.
Scrape and feed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib2, datetime | |
from lxml import etree | |
from yattag import Doc | |
# Tutorial page to scrape; the XPath queries below match its markup.
url = "http://enricobergamini.it/scrape_feed_tutorial.html"
# Transforms a date written as '03/24/2017' into an RSS-valid
# 'Fri, 24 Mar 2017 00:00:00 +0200'.
def clean_date(x):
    """Return an RSS 2.0 pubDate string for a scraped date element.

    Accepts either an lxml element whose ``.text`` is 'MM/DD/YYYY' or a
    plain 'MM/DD/YYYY' string (backward-compatible generalization).
    Falls back to today's date when the value cannot be parsed.
    """
    raw = getattr(x, 'text', x)
    try:
        d = datetime.datetime.strptime(raw, '%m/%d/%Y')
    except (TypeError, ValueError) as e:
        print(e)
        # Original fallback had a trailing comma, which returned a
        # 1-tuple instead of a string and broke the feed output.
        d = datetime.datetime.today()
    # %z is empty on a naive datetime, so the fixed offset is appended
    # literally (the original '%z +0200' left a stray double space).
    return d.strftime('%a, %d %b %Y %H:%M:%S +0200')
# Download the page and parse the HTML into an element tree.
response = urllib2.urlopen(url)
try:
    # read()/parse inside try/finally so the connection is closed even
    # when one of them raises (the original leaked the handle on error).
    page = etree.HTML(response.read())
finally:
    response.close()

# Accumulators filled by the extraction loops below.
list_of_titles = []
list_of_dates = []
list_of_hrefs = []

# Each findall returns a (possibly empty) list of matching elements.
titles_tags = page.findall('.//h1')
pubDate_tags = page.findall('.//p')
href_tags = page.findall('.//a')
# Turn each list of lxml elements into plain values, three ways:
#   * titles: take the element text
#   * dates:  normalise to RSS format via clean_date()
#   * links:  read the 'href' attribute
list_of_titles = [title.text for title in titles_tags]
list_of_dates = [clean_date(date) for date in pubDate_tags]
list_of_hrefs = [link.get('href') for link in href_tags]

# One (title, pubDate, link) tuple per feed entry.  Wrapping zip() in
# list() keeps the result printable and re-iterable on Python 3, where
# zip returns a one-shot iterator (on Python 2 it is already a list).
raw_datalist = list(zip(list_of_titles, list_of_dates, list_of_hrefs))
print(raw_datalist)
# It's really easy to write correct XML/HTML with Yattag: we emit an RSS
# document, looping over raw_datalist to create one <item> per entry.
def generate_feed():
    """Write the module-level raw_datalist as an RSS 2.0 feed.

    Builds the XML with yattag and saves it to 'your_feed.xml' in the
    current directory.  Each (title, pubDate, link) tuple becomes one
    <item>; the title doubles as the description since no separate
    summary was scraped.
    """
    doc, tag, text, line = Doc().ttl()
    doc.asis('<?xml version="1.0" encoding="UTF-8"?>')
    with tag('rss',
        ('xmlns:atom', 'http://www.w3.org/2005/Atom'),
        ('version', '2.0')
    ):
        with tag('channel'):
            # Required channel metadata per the RSS 2.0 spec.
            line('title', 'Your Title - example')
            line('link', 'http://enricobergamini.it/feed')
            line('description', 'description of the feed')
            line('language', 'en')
            for item_title, item_date, item_href in raw_datalist:
                with tag('item'):
                    line('title', item_title)
                    line('pubDate', item_date)
                    line('link', item_href)
                    line('description', item_title)
    # BUG FIX: the original opened the file with mode 'wf', which is
    # invalid and raises ValueError before anything is written.
    with open('your_feed.xml', 'w') as f:
        f.write(doc.getvalue())
generate_feed()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment