Skip to content

Instantly share code, notes, and snippets.

@jamiesun
Created May 27, 2014 13:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jamiesun/839d0ab89c518755c4cf to your computer and use it in GitHub Desktop.
Save jamiesun/839d0ab89c518755c4cf to your computer and use it in GitHub Desktop.
fetch epg
#coding:utf-8
import re
import urllib
import pprint
import logging
from lxml.html import fromstring,tostring
from gevent.pool import Pool
# Shared gevent pool used by fetch_channel_epg to map over page URLs;
# constructed with a size of 3.
# NOTE(review): no gevent monkey-patching is visible in this file, so the
# urllib calls presumably block instead of yielding -- confirm.
pool = Pool(3)
def fetch_doc(url):
    """Download *url* and parse the body into an lxml HTML document.

    Links inside the parsed document are rewritten as absolute URLs using
    *url* as the base.

    Args:
        url: Address of the page to fetch.

    Returns:
        The element returned by ``lxml.html.fromstring`` for the page body.
    """
    response = urllib.urlopen(url)
    try:
        content = response.read()
    finally:
        # Release the connection even when reading fails; the original
        # never closed the response object.
        response.close()
    doc = fromstring(content)
    doc.make_links_absolute(url)
    return doc
def fetch_channel(index_url):
    """Collect the channel links (for the current day) listed on an index page.

    Args:
        index_url: URL of the category index page.

    Returns:
        ``None`` when the page has no element with class ``chlsnav``;
        otherwise a ``(category, links)`` tuple where *category* is the text
        of the first ``pbar`` element and *links* is a list of
        ``(channel_name, href)`` tuples.

    Raises:
        IndexError: If the ``chlsnav`` element contains no ``pbar`` element.
    """
    doc = fetch_doc(index_url)
    navs = doc.find_class('chlsnav')
    if not navs:
        return None
    nav = navs[0]
    category = nav.find_class("pbar")[0].text_content()
    links = []
    for li_el in nav.findall('ul/li'):
        anchors = li_el.xpath('a')
        if not anchors:
            continue
        anchor = anchors[0]
        # attrib.get() instead of attrib[...]: an <a> without an href is
        # skipped rather than aborting the whole scan with KeyError.
        href = anchor.attrib.get('href')
        if href:
            # (channel name, channel link for the current day)
            links.append((anchor.text_content(), href))
    return category, links
def fetch_channel_epg(category, links, debug=True):
    """Fetch a week of programme listings for every channel in *links*.

    Each channel's current-day URL is expanded into seven per-day URLs and
    every URL is fetched through the module-level ``pool``.

    Args:
        category: Category name copied into every result row.
        links: Iterable of ``(channel_name, current_day_url)`` pairs.
        debug: When true, print channel, week label and URL before each
            fetch.

    Returns:
        The result of ``pool.map``: one dict per fetched URL, mapping the
        week label (``'week-1'`` .. ``'week-7'``) to a list of
        ``(category, week, channel, time, description)`` tuples. The dict
        is empty when fetching or parsing that URL failed.
    """

    def _week_links(link_url):
        """Derive the seven per-day (url, label) pairs from one day's URL."""
        return (
            (re.sub(r'(-w\d+)', '-w%s' % day, link_url), 'week-%s' % day)
            for day in range(1, 8)
        )

    def _fetch_epg(args):
        """Fetch and parse the programme list behind a single URL."""
        chl, week, url = args[0], args[1], args[2]
        if debug:
            # Single-argument print(...) behaves like the print statement
            # under Python 2; the URL is encoded only when it is not
            # already a byte string so the join cannot mix types.
            shown_url = url if isinstance(url, str) else url.encode('utf-8')
            print(' '.join([chl.encode('utf-8'), week, shown_url]))
        _week_results = {}
        try:
            doc = fetch_doc(url)
            pgrow = doc.get_element_by_id("pgrow")
            rows = []
            for eli in pgrow.findall("li"):
                # Drop the first 'tvgd' element so its text does not end up
                # in the description.
                tvgd_el = eli.find_class('tvgd')
                if tvgd_el:
                    eli.remove(tvgd_el[0])
                _time_el = eli.find("span")
                # .text may be None for an empty <span>; treat it like a
                # missing time instead of failing the whole page.
                _time = (_time_el.text or '') if _time_el is not None else ''
                if not _time:
                    continue
                _epg_desc = eli.text_content().replace(_time, '')
                # Strip the literal labels "plot", "stills" and "cast list"
                # (presumably captions of auxiliary links -- confirm).
                _epg_desc = _epg_desc.replace(u'剧情', '')
                _epg_desc = _epg_desc.replace(u'剧照', '')
                _epg_desc = _epg_desc.replace(u'演员表', '')
                if _epg_desc:
                    # category, week label, channel, time, programme text
                    rows.append((category, week, chl, _time, _epg_desc))
            _week_results[week] = rows
        except Exception:
            # The original handler referenced an undefined name (_wlink),
            # so any failure surfaced as a NameError; log the URL instead.
            logging.exception("fetch url error %s", url)
        return _week_results

    # Build one [channel, week label, url] job per channel and day.
    params = []
    for chl, link_url in links:
        for week_url, week in _week_links(link_url):
            params.append([chl, week, week_url])
    return pool.map(_fetch_epg, params)
def test_fetch_channel():
    """Print the category name, then each channel's name and link."""
    index_url = 'http://www.tvmao.com/program/channels'
    category, links = fetch_channel(index_url)
    print(category.encode('utf-8'))
    for channel in links:
        name, href = channel[0], channel[1]
        print(name.encode('utf-8'))
        print(href)
def test_fetch_epg():
    """Fetch the listings of the first channel and print them by week label.

    Each row is printed as comma-separated UTF-8 fields, followed by a
    70-character dashed separator after every week label's rows.
    """
    index_url = 'http://www.tvmao.com/program/channels'
    category, links = fetch_channel(index_url)
    first_channel_only = [links[0]]
    category_result = fetch_channel_epg(category, first_channel_only)
    for week_map in category_result:
        ordered = sorted(week_map.iteritems(), key=lambda item: item[0])
        for week_label, rows in ordered:
            print(week_label)
            for row in rows:
                encoded = [field.encode('utf-8') for field in row]
                print(','.join(encoded))
            print("-" * 70)
if __name__ == "__main__":
    # Runs the programme-listing check when executed as a script;
    # test_fetch_channel() can be called here instead to print only the
    # channel links.
    test_fetch_epg()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment