Skip to content

Instantly share code, notes, and snippets.

@tigerwang202
Created March 2, 2012 11:35
Show Gist options
  • Save tigerwang202/1957916 to your computer and use it in GitHub Desktop.
Save tigerwang202/1957916 to your computer and use it in GitHub Desktop.
Fetch the VOA Special English RSS feed and convert it to HTML files
# -*- coding: utf-8 -*-
# Fetch VOA Special English
# wangmengyin 2012-02-26
import urllib.request
import urllib.error
from xml.dom import minidom, Node
from os import system, path
import os
import re
import sys
from subprocess import call
import pickle
# When True, main() downloads each article's mp3 / picture and rewrites the
# article entry to point at the local copy instead of the remote URL.
localmp3 = False
localjpg = False
# Original VOA feed URL, kept for reference (currently not used):
#rss_link = 'http://www.voanews.com/templates/Articles.rss?' \
# 'sectionPath=/learningenglish/home'
# URL the feed is actually fetched from.
rss_link = 'http://rss2proxy.appspot.com'
# HTTP proxy used when fetching rss_link.
proxy_url = 'http://203.208.46.1:80/' # ip of google.cn
# File name of the article content page template; main() reads it from the
# 'rss' directory.
template_file = 'voa_special_english_content.html'
def main():
    """Fetch the RSS feed and render every usable item to local files.

    Previous output is removed first.  The feed at ``rss_link`` is then
    fetched through ``proxy_url`` and parsed.  Each ``<item>`` that has an
    mp3, a picture and embedded article content is written to
    ``rss/<n>.html`` using the HTML template, and a summary of all rendered
    articles is written to ``rss.xml`` in the current directory.

    Raises:
        urllib.error.URLError: if the feed cannot be fetched.
    """
    # Start from a clean slate: drop files left over from a previous run.
    clearBuf()
    # Read the html template file.
    template = readtemplateFile('rss', template_file)

    # Fetch the rss feed via the proxy.  build_opener + ProxyHandler replaces
    # the deprecated FancyURLopener.
    print('Fetch rss seed, via {%s} proxy, wait.' %proxy_url)
    proxies = {'http': proxy_url}
    opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxies))
    with opener.open(rss_link) as link:
        # Parse the rss content while the response is still open.
        print('Parse content')
        dom = minidom.parse(link)

    articles = []
    page_num = 0
    for node in dom.getElementsByTagName('item'):
        mp3 = handleMp3(node)
        jpg = handlePic(node)
        article_content = handleArticleContent(node)
        # Skip invalid items: all three parts are required.
        if mp3 == '' or jpg == '' or article_content == '':
            continue

        # Summary of the article; also used to fill in the template.
        article = {
            'title': handleText(node, 'title'),
            'link': handleText(node, 'link'),
            'pubDate': handleText(node, 'pubDate'),
            'description': handleText(node, 'description'),
            'mp3': mp3,
            'jpg': jpg,
        }
        articles.append(article)
        print('Fecth article[%d]: %s' %(page_num + 1, article['title']))

        if localjpg:
            # Save the picture and point the article at the local copy.
            pathPic = savePic(article['jpg'])
            if pathPic != '':
                article['jpg'] = pathPic
        if localmp3:
            # Save the mp3 and point the article at the local copy.
            pathMp3 = saveMp3(article['mp3'])
            if pathMp3 != '':
                article['mp3'] = pathMp3

        # Generate the article page; pages are numbered from 1.
        html = generateArticle(template, article, article_content)
        html_path = path.join('rss', '{0:d}.html'.format(page_num + 1))
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(html)
        page_num = page_num + 1

    print('\nGet %d articles, saved to html page.' %page_num)

    # Generate the rss index.
    with open('rss.xml', 'w', encoding='utf-8') as f:
        f.write(genXml(articles))
def getText(nodelist):
    """Return the concatenated data of the text nodes in *nodelist*.

    Only nodes whose ``nodeType`` equals ``TEXT_NODE`` contribute; other
    nodes (for example child elements) are ignored and not descended into.
    An empty node list yields ``''``.
    """
    return ''.join(node.data for node in nodelist
                   if node.nodeType == node.TEXT_NODE)
def handleText(node, s):
    """Return the text of the first ``<s>`` element below *node*.

    Returns ``''`` when *node* has no such element, matching the other
    ``handle*`` helpers, instead of raising ``IndexError``.
    """
    elements = node.getElementsByTagName(s)
    if not elements:
        return ''
    return getText(elements[0].childNodes)
def handleMp3(node):
    """Return the ``url`` of the first typed ``media:content`` element.

    The first ``<media:content>`` below *node* that carries a ``type``
    attribute is selected and its ``url`` attribute returned.  Returns
    ``''`` when there is no such element.
    """
    for element in node.getElementsByTagName('media:content'):
        if element.hasAttribute('type'):
            return element.getAttribute('url')
    return ''
def handlePic(node):
    """Return the ``url`` of the first ``enclosure`` element that has one.

    Returns ``''`` when *node* contains no ``<enclosure>`` element with a
    ``url`` attribute.
    """
    for element in node.getElementsByTagName('enclosure'):
        if element.hasAttribute('url'):
            return element.getAttribute('url')
    return ''
def handleArticleContent(node):
    """Extract the embedded HTML article body from a feed item.

    The first ``<content:encoded>`` element below *node* is serialized with
    ``toxml()``.  If the result contains ``</object>``, everything up to and
    including its first occurrence is dropped, together with a fixed-length
    trailer at the end of the string.

    Returns ``''`` when *node* has no ``<content:encoded>`` element.
    """
    elements = node.getElementsByTagName('content:encoded')
    if not elements:
        return ''

    content = elements[0].toxml()
    match = re.search(r'</object>', content)
    if match:
        # The serialized element is expected to end with this trailer; the
        # same number of characters is cut off the end unconditionally.
        trailer = '</div>]]></content:encoded>'
        content = content[match.end(): -len(trailer)]
    # NOTE(review): without '</object>' the serialized element is returned
    # unchanged - confirm this is the intended fallback.
    return content
def generateArticle(template, article, article_content):
    """Fill the HTML *template* with one article's data.

    *template* is a ``str.format`` template using the named fields
    ``link``, ``title``, ``pubDate``, ``jpg``, ``mp3`` and ``content``.
    The first five are taken from the *article* dict, ``content`` from
    *article_content*.  Returns the rendered page as a string.
    """
    return template.format(
        link=article['link'],
        title=article['title'],
        pubDate=article['pubDate'],
        jpg=article['jpg'],
        mp3=article['mp3'],
        content=article_content,
    )
def readtemplateFile(directory, filename):
    """Read and return the UTF-8 text of ``<cwd>/<directory>/<filename>``.

    The path being read is printed first.  The file is closed even when
    reading fails.
    """
    fp = path.join(os.getcwd(), directory, filename)
    print('Read html template file: %s' %fp)
    with open(fp, 'r', encoding='utf-8') as f:
        return f.read()
def downloadFile(file_name,file_mode,url):
    """Download *url* and write the response body to *file_name*.

    The local file is opened with mode ``"w" + file_mode`` (pass ``'b'``
    for binary data).  HTTP and URL errors are reported on stdout rather
    than raised.

    Returns True on success and False when an HTTP/URL error occurred.
    """
    try:
        print("downloading ", url)
        # Open the remote resource first, so a failed request never creates
        # the local file; both handles are closed even if the copy fails.
        with urllib.request.urlopen(url) as response, \
                open(file_name, "w" + file_mode) as local_file:
            local_file.write(response.read())
    except urllib.error.HTTPError as e:
        print("HTTP Error:",e.code , url)
    except urllib.error.URLError as e:
        print("URL Error:",e.reason , url)
    else:
        return True
    return False
def savePic(url):
    """Download the picture at *url* into ``<cwd>/rss/jpg``.

    The local file is named after the last ``/``-separated component of
    *url*.  Returns the path of the saved file relative to the ``rss``
    directory (``jpg/<name>``), or ``''`` when the download failed.
    """
    print('Pic ', end = '')
    base_name = url.split('/')[-1]
    file_name = path.join(os.getcwd(), 'rss', 'jpg', base_name)
    if downloadFile(file_name, 'b', url):
        return path.join('jpg', base_name)
    # A failed download may or may not have created the file; only remove
    # it when it is actually there.
    if path.exists(file_name):
        os.remove(file_name)
    return ''
def saveMp3(url):
    """Download the mp3 at *url* into ``<cwd>/rss/mp3`` using curl.

    curl is used because the files are large.  The local file is named
    after the last ``/``-separated component of *url*.

    Returns the path of the saved file relative to the ``rss`` directory
    (``mp3/<name>``), or ``''`` when curl could not be run or exited with
    a non-zero status.
    """
    print('Mp3 downloading %s' %url)
    base_name = url.split('/')[-1]
    file_name = path.join(os.getcwd(), 'rss', 'mp3', base_name)
    # Pass the arguments as a list: a single command string is not split
    # into arguments on POSIX, and a list is safe for paths with spaces.
    command = ['curl', '--connect-timeout', '50', '-o', file_name, url]
    try:
        ret = call(command)
    except OSError as e:
        # Typically: the curl executable is not installed / not on PATH.
        print('curl could not be started: %s' % e)
        return ''
    if ret != 0:
        print('curl Error %d refer to http://linux.die.net/man/1/curl' %ret)
        # curl may have failed before creating the output file.
        if path.exists(file_name):
            os.remove(file_name)
        return ''
    return path.join('mp3', base_name)
def clearBuf():
    """Remove the files generated by a previous run.

    Deletes, relative to the current working directory:

    * every file below ``rss/jpg`` and ``rss/mp3``,
    * every file below ``rss`` whose name is ``<digits>.html``,
    * ``rss.xml``, if present.
    """
    curdir = os.getcwd()
    rmfiles(path.join(curdir, 'rss', 'jpg')) # rm rss/jpg/*.jpg
    rmfiles(path.join(curdir, 'rss', 'mp3')) # rm rss/mp3/*.mp3

    # rm rss/[0-9]+.html -- match the whole file name (not the full path,
    # and with the dot escaped) so that unrelated files such as the html
    # template are never deleted.
    page_pattern = re.compile(r'[0-9]+\.html')
    for root, _dirs, files in os.walk(path.join(curdir, 'rss'), topdown=False):
        for name in files:
            if page_pattern.fullmatch(name):
                os.remove(os.path.join(root, name))

    # rm rss.xml
    index_path = os.path.join(curdir, 'rss.xml')
    if os.path.exists(index_path):
        os.remove(index_path)
def rmfiles(curdir):
    """Delete every file in *curdir* and its subdirectories.

    Directories themselves are left in place.  Nothing happens when
    *curdir* does not exist.
    """
    for root, _dirs, files in os.walk(curdir, topdown=False):
        for name in files:
            os.remove(os.path.join(root, name))
def genXml(articles):
    """Return the article list as an XML string.

    The result is a ``<rssfeed>`` element containing one ``<item>`` block
    (as produced by ``handleNode``) per entry of *articles*, each followed
    by a newline.
    """
    items = ''.join(handleNode(article) + '\n' for article in articles)
    return '<rssfeed>' + '\n' + items + '</rssfeed>'
def handleNode(article):
    """Return one ``<item>`` block for the *article* dict.

    Every key of *article* becomes a tab-indented child element (as
    produced by ``handleElement``), in the dict's iteration order.
    """
    elements = ''.join('\t' + handleElement(article, key) for key in article)
    return '<item>' + '\n' + elements + '</item>' + '\n'
def handleElement(article, i):
    """Return ``<i>value</i>`` followed by a newline for ``article[i]``.

    The value is XML-escaped (``&``, ``<`` and ``>``) so that text such as
    ``Health & Science`` does not produce a malformed document.
    """
    # Imported here so the function does not depend on a module-level import.
    from xml.sax.saxutils import escape
    return '<{0}>{1}</{0}>\n'.format(i, escape(str(article[i])))
# Standard boilerplate: call main() only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment