tigerwang202/fetch_voa_special_english_mp3.py

## fetch_voa_special_english_mp3.py
# -*- coding: utf-8 -*-
# Fetch VOA Special English
# wangmengyin 2012-02-26
import urllib.request
import urllib.error
from xml.dom import minidom, Node
from os import system, path
import os
import re
import sys
from subprocess import call
import pickle

# When True, main() downloads each article's mp3 / picture (into rss/mp3
# and rss/jpg) and points the generated page at the local copy instead of
# the remote URL.
localmp3 = False
localjpg = False

# Direct VOA feed URL, kept for reference only; the rss_link assigned below
# is the one main() actually opens.
#rss_link = 'http://www.voanews.com/templates/Articles.rss?' \
#           'sectionPath=/learningenglish/home'
rss_link = 'http://rss2proxy.appspot.com'
# HTTP proxy that main() uses when fetching rss_link.
proxy_url = 'http://203.208.46.1:80/' # ip of google.cn
# article content page template; main() reads it from the rss/ directory
template_file = 'voa_special_english_content.html'

def main():
    """Fetch the feed and build one local html page per usable article.

    Clears the output of a previous run, downloads the RSS feed through the
    configured HTTP proxy, writes ``rss/<n>.html`` for every item that has
    an mp3 URL, a picture URL and embedded content, and finally writes the
    ``rss.xml`` index describing all saved articles.
    """
    # clear buffer files
    clearBuf()
    # read html template file
    template = readtemplateFile('rss', template_file)
    # fetch rss via proxy
    print('Fetch rss seed, via {%s} proxy, wait.' %proxy_url)
    proxies = {'http': proxy_url}
    # build_opener + ProxyHandler replaces the deprecated FancyURLopener.
    opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxies))
    # parse rss content; the response is closed as soon as parsing is done
    with opener.open(rss_link) as link:
        print('Parse content')
        dom = minidom.parse(link)
    articles = []
    page_num = 0
    for node in dom.getElementsByTagName('item'):
        mp3 = handleMp3(node)
        jpg = handlePic(node)
        article_content = handleArticleContent(node)
        # skip invalid page
        if mp3 == '' or jpg == '' or article_content == '':
            continue
        # parse summary
        article = {
            'title' : handleText(node, 'title'),
            'link' : handleText(node, 'link'),
            'pubDate' : handleText(node, 'pubDate'),
            'description' : handleText(node, 'description'),
            'mp3' : mp3,
            'jpg' : jpg
            }
        articles.append(article)
        print('Fecth article[%d]: %s' %(page_num + 1, article['title']))
        if localjpg:
            # save pic
            pathPic = savePic(article['jpg'])
            if pathPic != '':
                article['jpg'] = pathPic # replace pic with local buffer.
        if localmp3:
            # save mp3
            pathMp3 = saveMp3(article['mp3'])
            if pathMp3 != '':
                article['mp3'] = pathMp3 # replace mp3 with local buffer.
        # generate article page
        html = generateArticle(template, article, article_content)
        html_path = path.join('rss',
                              "{0:d}{1}".format(page_num + 1, '.html'))
        with open(html_path, 'w', encoding = 'utf-8') as f:
            f.write(html)
        page_num = page_num + 1
    print('\nGet %d articles, saved to html page.' %page_num)
    # generate rss index
    with open('rss.xml', 'w', encoding = 'utf-8') as f:
        f.write(genXml(articles))

def getText(nodelist):
    """Return the concatenated data of the text nodes in ``nodelist``.

    Only members whose ``nodeType`` equals ``TEXT_NODE`` contribute; every
    other node is ignored.
    """
    return ''.join(child.data for child in nodelist
                   if child.nodeType == child.TEXT_NODE)

def handleText(node, s):
    """Return the text of the first ``s`` element found below ``node``.

    Returns '' when ``node`` contains no ``s`` element, instead of raising
    IndexError, which matches how the other handle* helpers report missing
    data.
    """
    matches = node.getElementsByTagName(s)
    if not matches:
        return ''
    return getText(matches[0].childNodes)

def handleMp3(node):
    """Return the ``url`` of the first typed ``media:content`` child.

    The first ``media:content`` element that carries a ``type`` attribute
    wins; '' is returned when there is no such element.
    """
    typed_urls = (media.getAttribute('url')
                  for media in node.getElementsByTagName('media:content')
                  if media.hasAttribute('type'))
    return next(typed_urls, '')

def handlePic(node):
    """Return the ``url`` of the first ``enclosure`` child that has one.

    '' is returned when no ``enclosure`` element carries a ``url``
    attribute.
    """
    for enclosure in node.getElementsByTagName('enclosure'):
        if not enclosure.hasAttribute('url'):
            continue
        return enclosure.getAttribute('url')
    return ''

def handleArticleContent(node):
    """Extract the article html embedded in ``content:encoded``.

    The first ``content:encoded`` element is serialised; everything up to
    and including the first ``</object>`` is dropped, as are the last 27
    characters (the length of ``</div>]]></content:encoded>``). Returns ''
    when the element or the ``</object>`` marker is missing.
    """
    encoded = node.getElementsByTagName('content:encoded')
    if not encoded:
        return ''
    markup = encoded[0].toxml()
    marker = '</object>'
    start = markup.find(marker)
    if start < 0:
        return ''
    tail_len = len('</div>]]></content:encoded>')  # 27
    return markup[start + len(marker):-tail_len]

def generateArticle(template, article, article_content):
    """Fill the html ``template`` with one article's data.

    ``template`` is a ``str.format`` template using the named fields
    ``link``, ``title``, ``pubDate``, ``jpg``, ``mp3`` and ``content``; the
    first five come from ``article``, the last from ``article_content``.
    """
    fields = {key: article[key]
              for key in ('link', 'title', 'pubDate', 'jpg', 'mp3')}
    fields['content'] = article_content
    return template.format(**fields)

def readtemplateFile(directory, filename):
    """Return the utf-8 text of ``filename`` inside ``directory``.

    ``directory`` is resolved against the current working directory. The
    file is opened in a ``with`` block so the handle is closed even when
    reading fails.
    """
    fp = path.join(os.getcwd(), directory, filename)
    print('Read html template file: %s' %fp)
    with open(fp, 'r', encoding = 'utf-8') as f:
        return f.read()

#function that downloads a file
def downloadFile(file_name,file_mode,url):
    """Download ``url`` into ``file_name`` and report whether it worked.

    ``file_mode`` is appended to ``"w"`` when opening the local file (pass
    ``'b'`` for binary data). Returns True on success; HTTP and URL errors
    are printed and reported as False. Both the response and the local
    file are closed even if reading or writing fails part-way.
    """
    print("downloading ", url)
    try:
        # Open the url first so nothing is created locally when it fails.
        with urllib.request.urlopen(url) as response:
            with open(file_name, "w" + file_mode) as local_file:
                local_file.write(response.read())
    except urllib.error.HTTPError as e:
        print("HTTP Error:",e.code , url)
        return False
    except urllib.error.URLError as e:
        print("URL Error:",e.reason , url)
        return False
    return True

# save pic
def savePic(url):
    """Download the picture at ``url`` into ``rss/jpg``.

    Returns the path relative to ``rss`` (``jpg/<name>``) on success and ''
    when the download fails.
    """
    print('Pic ', end = '')
    base_name = url.split('/')[-1]
    file_name = path.join(os.getcwd(), 'rss', 'jpg', base_name)
    if downloadFile(file_name, 'b', url):
        return path.join('jpg', base_name)
    # A failed download may leave a partial file or no file at all, so a
    # missing file must not turn the failure into an exception.
    try:
        os.remove(file_name)
    except FileNotFoundError:
        pass
    return ''

# save mp3 with the external curl tool, because the files are big.
def saveMp3(url):
    """Download the mp3 at ``url`` into ``rss/mp3`` with curl.

    Returns the path relative to ``rss`` (``mp3/<name>``) on success and ''
    when curl exits with a non-zero status.
    """
    print('Mp3 downloading %s' %url)
    base_name = url.split('/')[-1]
    file_name = path.join(os.getcwd(), 'rss', 'mp3', base_name)
    # Pass an argument list: a single command string without shell=True is
    # taken as the program name on POSIX, and a list also keeps the URL and
    # the path from being split or shell-interpreted.
    ret = call(['curl', '--connect-timeout', '50', '-o', file_name, url])
    if ret != 0:
        print('curl Error %d refer to http://linux.die.net/man/1/curl' %ret)
        # curl may have failed before it created the output file
        if path.exists(file_name):
            os.remove(file_name)
        return ''
    return path.join('mp3', base_name)

# clear temp buffer files
def clearBuf():
    """Delete the output of a previous run from the current directory.

    Removes every file below ``rss/jpg`` and ``rss/mp3``, every generated
    ``<digits>.html`` page found below ``rss``, and ``rss.xml``.
    """
    curdir = os.getcwd()
    rmfiles(path.join(curdir, 'rss', 'jpg')) # rm rss/jpg/*
    rmfiles(path.join(curdir, 'rss', 'mp3')) # rm rss/mp3/*
    # rm rss/<digits>.html: test the bare file name with an anchored pattern
    # and an escaped dot, so digits elsewhere in the path cannot select
    # unrelated files such as the html template.
    for root, dirs, files in os.walk(path.join(curdir, 'rss'), topdown=False):
        for name in files:
            if re.search(r'\A[0-9]+\.html\Z', name):
                os.remove(os.path.join(root, name))
    # rm rss.xml
    index_path = os.path.join(curdir, 'rss.xml')
    if os.path.exists(index_path):
        os.remove(index_path)

# remove files in curdir
def rmfiles(curdir):
    """Delete every file below ``curdir``, leaving the directories.

    The tree is walked bottom-up; a ``curdir`` that does not exist simply
    yields nothing to delete.
    """
    walker = os.walk(curdir, topdown=False)
    doomed = (os.path.join(folder, leaf)
              for folder, _subdirs, leaves in walker
              for leaf in leaves)
    for victim in doomed:
        os.remove(victim)

# generate article list xml
def genXml(articles):
    """Return the article index: one ``<item>`` per article in ``<rssfeed>``.

    Each article is rendered by handleNode() and followed by a newline.
    """
    pieces = ['<rssfeed>' + '\n']
    pieces.extend(handleNode(entry) + '\n' for entry in articles)
    pieces.append('</rssfeed>')
    return ''.join(pieces)

def handleNode(article):
    """Render one article dict as an ``<item>`` element.

    Every key becomes a tab-indented child element produced by
    handleElement(), in the dict's iteration order.
    """
    rows = ['\t' + handleElement(article, field) for field in article]
    return '<item>' + '\n' + ''.join(rows) + '</item>' + '\n'

def handleElement(article, i):
    """Render ``article[i]`` as the xml element ``<i>value</i>`` plus newline.

    The value is XML-escaped so that ``&``, ``<`` and ``>`` in titles,
    descriptions or URLs cannot make the generated index malformed.
    """
    from xml.sax.saxutils import escape
    return '<{0}>{1}</{0}>\n'.format(i, escape(str(article[i])))

# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
	# -- coding: utf-8 --
	# Fetch VOA Special English
	# wangmengyin 2012-02-26
	import urllib.request
	import urllib.error
	from xml.dom import minidom, Node
	from os import system, path
	import os
	import re
	import sys
	from subprocess import call
	import pickle

	# When True, main() downloads each article's mp3 / picture (into rss/mp3
	# and rss/jpg) and points the generated page at the local copy instead
	# of the remote URL.
	localmp3 = False
	localjpg = False

	# Direct VOA feed URL, kept for reference only; the rss_link assigned
	# below is the one main() actually opens.
	#rss_link = 'http://www.voanews.com/templates/Articles.rss?' \
	# 'sectionPath=/learningenglish/home'
	rss_link = 'http://rss2proxy.appspot.com'
	# HTTP proxy that main() uses when fetching rss_link.
	proxy_url = 'http://203.208.46.1:80/' # ip of google.cn
	# article content page template; main() reads it from the rss/ directory
	template_file = 'voa_special_english_content.html'

	def main():
	    """Fetch the feed and build one local html page per usable article.

	    Clears the output of a previous run, downloads the RSS feed through
	    the configured HTTP proxy, writes ``rss/<n>.html`` for every item
	    that has an mp3 URL, a picture URL and embedded content, and finally
	    writes the ``rss.xml`` index describing all saved articles.
	    """
	    # clear buffer files
	    clearBuf()
	    # read html template file
	    template = readtemplateFile('rss', template_file)
	    # fetch rss via proxy
	    print('Fetch rss seed, via {%s} proxy, wait.' %proxy_url)
	    proxies = {'http': proxy_url}
	    # build_opener + ProxyHandler replaces the deprecated FancyURLopener.
	    opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxies))
	    # parse rss content; the response is closed as soon as parsing is done
	    with opener.open(rss_link) as link:
	        print('Parse content')
	        dom = minidom.parse(link)
	    articles = []
	    page_num = 0
	    for node in dom.getElementsByTagName('item'):
	        mp3 = handleMp3(node)
	        jpg = handlePic(node)
	        article_content = handleArticleContent(node)
	        # skip invalid page
	        if mp3 == '' or jpg == '' or article_content == '':
	            continue
	        # parse summary
	        article = {
	            'title' : handleText(node, 'title'),
	            'link' : handleText(node, 'link'),
	            'pubDate' : handleText(node, 'pubDate'),
	            'description' : handleText(node, 'description'),
	            'mp3' : mp3,
	            'jpg' : jpg
	            }
	        articles.append(article)
	        print('Fecth article[%d]: %s' %(page_num + 1, article['title']))
	        if localjpg:
	            # save pic
	            pathPic = savePic(article['jpg'])
	            if pathPic != '':
	                article['jpg'] = pathPic # replace pic with local buffer.
	        if localmp3:
	            # save mp3
	            pathMp3 = saveMp3(article['mp3'])
	            if pathMp3 != '':
	                article['mp3'] = pathMp3 # replace mp3 with local buffer.
	        # generate article page
	        html = generateArticle(template, article, article_content)
	        html_path = path.join('rss',
	                              "{0:d}{1}".format(page_num + 1, '.html'))
	        with open(html_path, 'w', encoding = 'utf-8') as f:
	            f.write(html)
	        page_num = page_num + 1
	    print('\nGet %d articles, saved to html page.' %page_num)
	    # generate rss index
	    with open('rss.xml', 'w', encoding = 'utf-8') as f:
	        f.write(genXml(articles))

	def getText(nodelist):
	    """Return the concatenated data of the text nodes in ``nodelist``.

	    Only members whose ``nodeType`` equals ``TEXT_NODE`` contribute;
	    every other node is ignored.
	    """
	    return ''.join(child.data for child in nodelist
	                   if child.nodeType == child.TEXT_NODE)

	def handleText(node, s):
	    """Return the text of the first ``s`` element found below ``node``.

	    Returns '' when ``node`` contains no ``s`` element, instead of
	    raising IndexError, which matches how the other handle* helpers
	    report missing data.
	    """
	    matches = node.getElementsByTagName(s)
	    if not matches:
	        return ''
	    return getText(matches[0].childNodes)

	def handleMp3(node):
	    """Return the ``url`` of the first typed ``media:content`` child.

	    The first ``media:content`` element that carries a ``type``
	    attribute wins; '' is returned when there is no such element.
	    """
	    typed_urls = (media.getAttribute('url')
	                  for media in node.getElementsByTagName('media:content')
	                  if media.hasAttribute('type'))
	    return next(typed_urls, '')

	def handlePic(node):
	    """Return the ``url`` of the first ``enclosure`` child that has one.

	    '' is returned when no ``enclosure`` element carries a ``url``
	    attribute.
	    """
	    for enclosure in node.getElementsByTagName('enclosure'):
	        if not enclosure.hasAttribute('url'):
	            continue
	        return enclosure.getAttribute('url')
	    return ''

	def handleArticleContent(node):
	    """Extract the article html embedded in ``content:encoded``.

	    The first ``content:encoded`` element is serialised; everything up
	    to and including the first ``</object>`` is dropped, as are the last
	    27 characters (the length of ``</div>]]></content:encoded>``).
	    Returns '' when the element or the ``</object>`` marker is missing.
	    """
	    encoded = node.getElementsByTagName('content:encoded')
	    if not encoded:
	        return ''
	    markup = encoded[0].toxml()
	    marker = '</object>'
	    start = markup.find(marker)
	    if start < 0:
	        return ''
	    tail_len = len('</div>]]></content:encoded>')  # 27
	    return markup[start + len(marker):-tail_len]

	def generateArticle(template, article, article_content):
	    """Fill the html ``template`` with one article's data.

	    ``template`` is a ``str.format`` template using the named fields
	    ``link``, ``title``, ``pubDate``, ``jpg``, ``mp3`` and ``content``;
	    the first five come from ``article``, the last from
	    ``article_content``.
	    """
	    fields = {key: article[key]
	              for key in ('link', 'title', 'pubDate', 'jpg', 'mp3')}
	    fields['content'] = article_content
	    return template.format(**fields)

	def readtemplateFile(directory, filename):
	    """Return the utf-8 text of ``filename`` inside ``directory``.

	    ``directory`` is resolved against the current working directory.
	    The file is opened in a ``with`` block so the handle is closed even
	    when reading fails.
	    """
	    fp = path.join(os.getcwd(), directory, filename)
	    print('Read html template file: %s' %fp)
	    with open(fp, 'r', encoding = 'utf-8') as f:
	        return f.read()

	#function that downloads a file
	def downloadFile(file_name,file_mode,url):
	    """Download ``url`` into ``file_name`` and report whether it worked.

	    ``file_mode`` is appended to ``"w"`` when opening the local file
	    (pass ``'b'`` for binary data). Returns True on success; HTTP and
	    URL errors are printed and reported as False. Both the response and
	    the local file are closed even if reading or writing fails part-way.
	    """
	    print("downloading ", url)
	    try:
	        # Open the url first so nothing is created locally when it fails.
	        with urllib.request.urlopen(url) as response:
	            with open(file_name, "w" + file_mode) as local_file:
	                local_file.write(response.read())
	    except urllib.error.HTTPError as e:
	        print("HTTP Error:",e.code , url)
	        return False
	    except urllib.error.URLError as e:
	        print("URL Error:",e.reason , url)
	        return False
	    return True

	# save pic
	def savePic(url):
	    """Download the picture at ``url`` into ``rss/jpg``.

	    Returns the path relative to ``rss`` (``jpg/<name>``) on success and
	    '' when the download fails.
	    """
	    print('Pic ', end = '')
	    base_name = url.split('/')[-1]
	    file_name = path.join(os.getcwd(), 'rss', 'jpg', base_name)
	    if downloadFile(file_name, 'b', url):
	        return path.join('jpg', base_name)
	    # A failed download may leave a partial file or no file at all, so a
	    # missing file must not turn the failure into an exception.
	    try:
	        os.remove(file_name)
	    except FileNotFoundError:
	        pass
	    return ''

	# save mp3 with the external curl tool, because the files are big.
	def saveMp3(url):
	    """Download the mp3 at ``url`` into ``rss/mp3`` with curl.

	    Returns the path relative to ``rss`` (``mp3/<name>``) on success and
	    '' when curl exits with a non-zero status.
	    """
	    print('Mp3 downloading %s' %url)
	    base_name = url.split('/')[-1]
	    file_name = path.join(os.getcwd(), 'rss', 'mp3', base_name)
	    # Pass an argument list: a single command string without shell=True
	    # is taken as the program name on POSIX, and a list also keeps the
	    # URL and the path from being split or shell-interpreted.
	    ret = call(['curl', '--connect-timeout', '50', '-o', file_name, url])
	    if ret != 0:
	        print('curl Error %d refer to http://linux.die.net/man/1/curl' %ret)
	        # curl may have failed before it created the output file
	        if path.exists(file_name):
	            os.remove(file_name)
	        return ''
	    return path.join('mp3', base_name)

	# clear temp buffer files
	def clearBuf():
	    """Delete the output of a previous run from the current directory.

	    Removes every file below ``rss/jpg`` and ``rss/mp3``, every generated
	    ``<digits>.html`` page found below ``rss``, and ``rss.xml``.
	    """
	    curdir = os.getcwd()
	    rmfiles(path.join(curdir, 'rss', 'jpg')) # rm rss/jpg/*
	    rmfiles(path.join(curdir, 'rss', 'mp3')) # rm rss/mp3/*
	    # rm rss/<digits>.html: test the bare file name with an anchored
	    # pattern and an escaped dot, so digits elsewhere in the path cannot
	    # select unrelated files such as the html template.
	    for root, dirs, files in os.walk(path.join(curdir, 'rss'), topdown=False):
	        for name in files:
	            if re.search(r'\A[0-9]+\.html\Z', name):
	                os.remove(os.path.join(root, name))
	    # rm rss.xml
	    index_path = os.path.join(curdir, 'rss.xml')
	    if os.path.exists(index_path):
	        os.remove(index_path)

	# remove files in curdir
	def rmfiles(curdir):
	    """Delete every file below ``curdir``, leaving the directories.

	    The tree is walked bottom-up; a ``curdir`` that does not exist
	    simply yields nothing to delete.
	    """
	    walker = os.walk(curdir, topdown=False)
	    doomed = (os.path.join(folder, leaf)
	              for folder, _subdirs, leaves in walker
	              for leaf in leaves)
	    for victim in doomed:
	        os.remove(victim)

	# generate article list xml
	def genXml(articles):
	    """Return the article index: one ``<item>`` per article in ``<rssfeed>``.

	    Each article is rendered by handleNode() and followed by a newline.
	    """
	    pieces = ['<rssfeed>' + '\n']
	    pieces.extend(handleNode(entry) + '\n' for entry in articles)
	    pieces.append('</rssfeed>')
	    return ''.join(pieces)

	def handleNode(article):
	    """Render one article dict as an ``<item>`` element.

	    Every key becomes a tab-indented child element produced by
	    handleElement(), in the dict's iteration order.
	    """
	    rows = ['\t' + handleElement(article, field) for field in article]
	    return '<item>' + '\n' + ''.join(rows) + '</item>' + '\n'

	def handleElement(article, i):
	    """Render ``article[i]`` as the xml element ``<i>value</i>`` plus newline.

	    The value is XML-escaped so that ``&``, ``<`` and ``>`` in titles,
	    descriptions or URLs cannot make the generated index malformed.
	    """
	    from xml.sax.saxutils import escape
	    return '<{0}>{1}</{0}>\n'.format(i, escape(str(article[i])))

	# Script entry point: run main() only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == '__main__':
	    main()