Created
May 27, 2014 13:54
-
-
Save jamiesun/839d0ab89c518755c4cf to your computer and use it in GitHub Desktop.
fetch epg
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding:utf-8 | |
import re | |
import urllib | |
import pprint | |
import logging | |
from lxml.html import fromstring,tostring | |
from gevent.pool import Pool | |
# Shared gevent pool used by fetch_channel_epg; created with a size of 3.
# NOTE(review): no gevent monkey-patching is visible in this file, so the
# blocking urllib calls may not actually overlap -- confirm.
pool = Pool(size=3)
def fetch_doc(url):
    """Download *url* and parse the response body into an lxml HTML document.

    The body is read in full, parsed with ``lxml.html.fromstring``, and every
    link in the resulting tree is rewritten as an absolute URL using *url* as
    the base.

    :param url: address of the page to download.
    :returns: the parsed document element.
    """
    response = urllib.urlopen(url)
    try:
        content = response.read()
    finally:
        # Close explicitly so the handle is released immediately rather than
        # whenever the response object happens to be garbage-collected.
        response.close()
    doc = fromstring(content)
    doc.make_links_absolute(url)
    return doc
def fetch_channel(index_url):
    """Collect every channel link (today's listing page) of a category page.

    :param index_url: URL of the category index page.
    :returns: ``None`` when the page contains no element with class
        ``chlsnav``; otherwise a ``(category, links)`` tuple where *category*
        is the text of the first ``pbar`` element inside that block and
        *links* is a list of ``(channel_name, href)`` tuples.
    :raises IndexError: if the ``chlsnav`` block has no ``pbar`` element.
    """
    doc = fetch_doc(index_url)
    navs = doc.find_class('chlsnav')
    if not navs:
        return None
    nav = navs[0]
    category = nav.find_class("pbar")[0].text_content()
    links = []
    for item in nav.findall('ul/li'):
        # Compare against None: lxml elements without children are falsy.
        anchor = item.find('a')
        if anchor is None:
            continue
        # .get() rather than attrib[...] so an <a> without an href is skipped
        # instead of aborting the whole page with a KeyError.
        href = anchor.get('href')
        if href:
            # (channel name, link to today's listing)
            links.append((anchor.text_content(), href))
    return category, links
def fetch_channel_epg(category, links, debug=True):
    """Fetch one week of programme listings for every channel in *links*.

    Each channel link is expanded into seven per-day URLs, and all of those
    pages are fetched through the module-level ``pool``.

    :param category: category name, stored as the first field of every row.
    :param links: iterable of ``(channel_name, today_url)`` pairs.
    :param debug: when true, print ``channel week url`` (UTF-8 encoded) to
        stdout before each page is fetched.
    :returns: the list produced by ``pool.map`` -- one dict per fetched page,
        mapping the day label (``'week-1'`` .. ``'week-7'``) to a list of
        ``(category, week, channel, time, description)`` tuples.  The dict is
        empty when fetching or parsing that page failed; the failure is
        logged, not raised.
    """
    def _week_links(link_url):
        """Return ``(url, label)`` pairs for days 1-7 of the channel's week.

        Every ``-w<digits>`` fragment of *link_url* is replaced by ``-w1`` ..
        ``-w7``; a URL without such a fragment is returned unchanged each time.
        """
        return (
            (re.sub(r'(-w\d+)', '-w%s' % day, link_url), 'week-%s' % day)
            for day in range(1, 8)
        )

    def _fetch_epg(args):
        """Fetch and parse the listing page described by ``[chl, week, url]``."""
        chl, week, url = args
        if debug:
            # Build one unicode string and encode once, so byte and unicode
            # values are never mixed implicitly.
            print((u'%s %s %s' % (chl, week, url)).encode('utf-8'))
        week_results = {}
        try:
            doc = fetch_doc(url)
            pgrow = doc.get_element_by_id("pgrow")
            rows = []
            for eli in pgrow.findall("li"):
                # Drop the 'tvgd' element so its text does not end up in the
                # programme description.
                tvgd_el = eli.find_class('tvgd')
                if tvgd_el:
                    eli.remove(tvgd_el[0])
                time_el = eli.find("span")
                # A <span> with no text has .text == None; fall back to ''
                # so the replace() below cannot raise TypeError.
                time_text = ''
                if time_el is not None:
                    time_text = time_el.text or ''
                desc = eli.text_content().replace(time_text, '')
                # Strip the fixed link labels embedded in each row.
                for label in (u'剧情', u'剧照', u'演员表'):
                    desc = desc.replace(label, '')
                if time_text and desc:
                    # category, day label, channel, time, programme text
                    rows.append((category, week, chl, time_text, desc))
            week_results[week] = rows
        except Exception:
            # Best effort: one broken page must not abort the whole crawl.
            # Log the URL that failed (the original referenced an undefined
            # name here, turning every failure into a NameError).
            logging.exception("fetch url error %s", url)
        return week_results

    # Expand every channel into its seven per-day page URLs.
    params = []
    for chl, link_url in links:
        for week_url, week in _week_links(link_url):
            params.append([chl, week, week_url])
    return pool.map(_fetch_epg, params)
def test_fetch_channel():
    """Manual smoke test: print the category, then each channel name and link."""
    category, links = fetch_channel('http://www.tvmao.com/program/channels')
    print(category.encode('utf-8'))
    for name, href in links:
        print(name.encode('utf-8'))
        print(href)
def test_fetch_epg():
    """Manual smoke test: print the first channel's weekly listings.

    Rows are printed as comma-joined UTF-8 fields, grouped by day label in
    sorted order, with a 70-dash separator after each day.
    """
    channels_url = 'http://www.tvmao.com/program/channels'
    category, links = fetch_channel(channels_url)
    pages = fetch_channel_epg(category, [links[0]])
    for page in pages:
        # Keys are unique, so walking the sorted keys visits the same
        # (label, rows) pairs as sorting the items by their first element.
        for week in sorted(page):
            print(week)
            for row in page[week]:
                print(','.join([field.encode('utf-8') for field in row]))
            print("-" * 70)
if __name__ == '__main__':
    # Runs the EPG smoke test by default; call test_fetch_channel() here
    # instead to only list the channels.
    test_fetch_epg()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment