Skip to content

Instantly share code, notes, and snippets.

@ikegami-yukino
Last active June 22, 2016 06:06
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ikegami-yukino/3033386 to your computer and use it in GitHub Desktop.
Save ikegami-yukino/3033386 to your computer and use it in GitHub Desktop.
しょぼかるからアニメのタイトルと読み仮名、キャラクター名を取得する
# -*- coding: utf-8 -*-
"""
しょぼかるからアニメのタイトルと読み仮名、キャラクター名を取得する
引数で結果を書き込むファイルを指定します
"""
import sys, urllib2, re, codecs, time
# Site root and the index page that links to the individual title pages.
base_url = 'http://cal.syoboi.jp'
start_url = base_url + '/list'

# Relative link to a title page, as found on the index page.
re_title_url = re.compile(r'<a href="(/tid/[0-9]+)">')
# First comma-separated entry of the keywords meta tag, used as the title.
title_name = re.compile(r'<meta name="keywords" content="([^,]+),')
# Content of the table row labelled "よみ" (the reading of the title).
# The pattern has no backslashes, so a plain unicode literal is equivalent
# to the raw-unicode form.
title_yomi = re.compile(u'<tr><th>よみ</th><td>([^<]+)</td></tr>')
# Header cell of a table row whose data cell starts with <wbr/>; the captured
# text is collected as a character name.
character = re.compile(r'<tr><th[^>]*>([^<]+)</th><td><wbr/>')
# Everything up to and including the "キャスト" (cast) section heading.  With
# re.S the greedy ".*" spans lines, so substituting '' keeps only the markup
# after the last such heading.
section_cast = re.compile(u'.*<div class="title">キャスト</div>', re.M | re.S)

opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/ 534.51.22')]

# Accumulated output lines: a "title,reading" line per title page followed by
# the character names found on that page.
animes = []


def _fetch(url):
    """Download *url* with the shared opener and return the body decoded as UTF-8."""
    response = opener.open(url)
    try:
        return response.read().decode('utf8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()


def _parse_title_page(html):
    """Return the output lines for one title page.

    The first line is "<title>,<reading>" (the reading part is empty when the
    page has no "よみ" row); the remaining lines are the names matched by
    ``character`` after the cast heading.  When the heading is absent nothing
    is stripped and the whole page is scanned.  Returns an empty list when the
    page has no keywords meta tag, i.e. no title can be determined.
    """
    found = title_name.search(html)
    if found is None:
        return []
    lines = [found.group(1) + ',' + ''.join(title_yomi.findall(html))]
    lines.extend(character.findall(section_cast.sub('', html)))
    return lines


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print(u'!!!結果を書き込むファイルを指定してください!!!')
        # Missing argument is a usage error: report failure to the caller.
        sys.exit(1)

    start_html = _fetch(start_url)
    for title in re_title_url.findall(start_html):
        # Pause before every request (presumably to avoid hammering the site).
        time.sleep(0.75)
        try:
            title_html = _fetch(base_url + title)
        except urllib2.URLError as err:
            # One unreachable page must not discard everything collected so far.
            sys.stderr.write('skipped %s: %r\n' % (title, err))
            continue

        lines = _parse_title_page(title_html)
        if not lines:
            sys.stderr.write('skipped %s: title not found\n' % title)
            continue

        # Progress output: echo the "title,reading" line just collected.
        print(lines[0])
        animes.extend(lines)

    with codecs.open(sys.argv[1], 'w', 'utf8') as output:
        output.write('\n'.join(animes))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment