Skip to content

Instantly share code, notes, and snippets.

@bluele
Created April 2, 2013 22:47
Show Gist options
  • Save bluele/5296898 to your computer and use it in GitHub Desktop.
Save bluele/5296898 to your computer and use it in GitHub Desktop.
Get tv schedule.
#-*- coding:utf-8 -*-
__author__ = 'bluele'
import re
from collections import Counter

import requests
from BeautifulSoup import BeautifulSoup as bs
# Complete example request URL: `host` plus the same query parameters that
# get_schedule() sends. Not referenced anywhere else in the visible code.
url = 'http://program.tv.jp.msn.com/tv.php?site=032&mode=06&category=g&area=013&template=program&sdate=20130321&lhour=7&shour=05'
# Base endpoint that get_schedule() requests.
host = 'http://program.tv.jp.msn.com/tv.php'
def get_items(soup):
    """Return every ``<td valign="top">`` cell found in *soup*.

    The caller treats each returned cell as one programme entry.
    """
    cells = soup.findAll('td', valign='top')
    return cells
def translate(string):
    """Turn the escaped brackets ``&lt;`` and ``&gt;`` back into ``<`` and ``>``."""
    text = string
    # Only these two entities are converted; any other entity is left as is.
    for entity, bracket in (('&lt;', '<'), ('&gt;', '>')):
        text = text.replace(entity, bracket)
    return text
def strip_tag(string):
    """Return *string* with every markup tag removed.

    Escaped brackets are unescaped through translate() first, so tags written
    as ``&lt;...&gt;`` are removed just like literal ``<...>`` tags.
    """
    unescaped = translate(string)
    return re.sub(u'<[^>]+>', u'', unescaped)
def parse_item(item):
    """Extract the programme details from one listing cell.

    Args:
        item: one cell as returned by get_items(). It must provide
            ``firstText()``, whose result is searched with ``find`` and
            ``findAll`` for the <h1>, <h2> and <p> elements.

    Returns:
        dict with the keys
            'channel': broadcasting station name,
            'time': air time, e.g. "04:30~08:00",
            'title': text of the <h1>, markup removed,
            'description': text of the first <p>, markup removed,
            'casts': list of names taken from the second <p> (may be empty).

    Raises:
        ValueError: if the <h2> text contains no space separating the
            station from the air time.
        IndexError: if fewer than two <p> elements are found (assuming
            ``findAll`` returns a list).
    """
    info = item.firstText()
    title = strip_tag(info.find('h1').text)

    # The <h2> text looks like "NHK 04:30~08:00". Split on the LAST space
    # only, so a station name that itself contains a space still unpacks
    # into exactly two parts instead of raising ValueError.
    channel, air_time = info.find('h2').text.rsplit(u' ', 1)

    paragraphs = info.findAll('p')
    description = strip_tag(paragraphs[0].text)

    # Names are separated by single spaces. Dropping empty pieces copes with
    # leading, trailing and doubled spaces, and yields [] when no one is
    # listed, so no blank "name" ever reaches the caller.
    casts_raw = strip_tag(paragraphs[1].text)
    casts = [name for name in casts_raw.split(u' ') if name]

    return {
        'channel': channel,
        'time': air_time,
        'title': title,
        'description': description,
        'casts': casts,
    }
def get_schedule(cache=False, sdate='20130321', timeout=10):
    """Return the raw HTML of the TV listing page.

    Args:
        cache: when true, read the saved copy 'dat/sample.html' instead of
            using the network; ``sdate`` and ``timeout`` are then ignored.
        sdate: value sent as the ``sdate`` query parameter (the default
            suggests a YYYYMMDD date string).
        timeout: seconds handed to requests.get() so that a stalled server
            cannot block the call forever.

    Returns:
        The page content as a byte string.

    Raises:
        IOError / OSError: if the cache file cannot be opened.
        requests.exceptions.RequestException: on connection failure, timeout,
            or an HTTP error status.
    """
    if cache:
        with open('dat/sample.html', 'rb') as f:
            return f.read()

    params = {
        'site': '032',
        'mode': '06',
        'category': 'g',
        'area': '013',
        'template': 'program',
        'sdate': sdate,
        'lhour': '7',
        'shour': '05',
    }
    response = requests.get(host, params=params, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than handing an error page to the parser.
    response.raise_for_status()
    return response.content
def execute(cache=True, min_count=2):
    """Print each cast name listed at least *min_count* times in the schedule.

    One line of the form "<name> <count>" is printed per qualifying name, in
    no particular order.

    Args:
        cache: forwarded to get_schedule(); the default keeps reading the
            saved sample page rather than hitting the network.
        min_count: smallest number of listings for a name to be printed.
    """
    soup = bs(get_schedule(cache=cache))

    # Tally every cast entry across all programme cells.
    appearances = Counter(
        cast
        for item in get_items(soup)
        for cast in parse_item(item)['casts']
    )

    for cast, count in appearances.items():
        if count >= min_count:
            # A single pre-formatted argument prints the same text whether
            # `print` is the Python 2 statement or the Python 3 function.
            print(u'%s %s' % (cast, count))
def main():
    """Script entry point; simply delegates to execute()."""
    execute()
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment