Created
April 29, 2014 10:16
-
-
Save nezuQ/11396114 to your computer and use it in GitHub Desktop.
ニコニコ動画のタグ検索結果をXML形式で取得する。(ログイン不要) ref: http://qiita.com/nezuq/items/16ad8ebd96f7bda77824
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# nc2xml.py
# Copyright (c) 2014 nezuq
# This software is released under the MIT License.
# http://opensource.org/licenses/mit-license.php
import contextlib
import datetime
import sys
import time
import urllib
import urllib2

from lxml import etree
# Command-line configuration.
#
# Usage: nc2xml.py [KEYWORD [SORT_TYPE [MIN_PAGE [MAX_PAGE]]]]
# Every positional argument is optional; a missing one falls back to the
# default given on its line below.
argvs = sys.argv
argc = len(argvs)

# Search keyword (the tag to look up); the argument is decoded as UTF-8.
KEYWORD = argvs[1].decode('utf-8') if argc > 1 else u'キマシタワー'

# Sort key passed to the search page:
#   'n'  date of the most recent comment
#   'v'  view count
#   'm'  mylist count
#   'r'  comment count
#   'f'  upload date (default)
#   'l'  playback length
SORT_TYPE = argvs[2] if argc > 2 else 'f'

# First result page from which video IDs are collected.
MIN_PAGENUMBER = int(argvs[3]) if argc > 3 else 1

# Last result page from which video IDs are collected.
MAX_PAGENUMBER = int(argvs[4]) if argc > 4 else 3

# Search method: tag search.
SEARCH_TYPE = 'tag'
def main():
    """Fetch tag-search results from Nico Nico Douga and save them as XML.

    Walks the RSS search pages from MIN_PAGENUMBER to MAX_PAGENUMBER
    (inclusive), collecting a video ID from the last path segment of every
    item link, and stops early as soon as a page contributes no IDs.  The
    collected IDs are then sent to the video.array API in a single request
    and the response is saved as '<KEYWORD>_<yymmddHHMMSS>.xml', relative to
    the working directory.  When no IDs were collected, the API request is
    skipped and no file is written.

    Reads the module-level settings KEYWORD, SORT_TYPE, SEARCH_TYPE,
    MIN_PAGENUMBER and MAX_PAGENUMBER.  Errors raised by urllib / urllib2 /
    lxml are not caught here.
    """
    keyword = urllib.quote(KEYWORD.encode('utf-8'))
    ids = []
    for npage in range(MIN_PAGENUMBER, MAX_PAGENUMBER + 1):
        uri = 'http://www.nicovideo.jp/%s/%s?sort=%s&rss=2.0&page=%d' % (
            SEARCH_TYPE, keyword, SORT_TYPE, npage)
        print(u'動画ID問合せ中 : ' + uri)
        # Wait between requests to keep the load on the server low.
        time.sleep(1)
        # urllib2 responses are not context managers in Python 2, so wrap
        # them in closing() to make sure the connection is released.
        with contextlib.closing(urllib2.urlopen(uri)) as res:
            rss = etree.fromstring(res.read())
        # The video ID is the last path segment of each item's link; links
        # without any text are skipped instead of raising.
        page_ids = [link.text.rsplit('/', 1)[-1]
                    for link in rss.findall('./channel/item/link')
                    if link.text]
        if not page_ids:
            # A page without results means there is nothing further to fetch.
            break
        ids.extend(page_ids)

    if ids:
        query = 'http://i.nicovideo.jp/v3/video.array?v=' + ','.join(ids)
        print(u'動画データ取得中 : ' + query)
        d = datetime.datetime.today()
        # NOTE: if the request takes 20 seconds or longer it times out and
        # yields zero results.
        # NOTE: do not issue heavy queries; they may lead to an IP block or
        # to the API being shut down.
        time.sleep(1)
        urllib.urlretrieve(
            query, u'%s_%s.xml' % (KEYWORD, d.strftime('%y%m%d%H%M%S')))
    else:
        # Nothing to look up: avoid a pointless API call and an empty file.
        print(u'動画IDが見つかりませんでした。')
    print('end!')


# Run the search only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
python nc2xml.py "百合" m 1 3
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment