Created
May 1, 2014 21:45
-
-
Save nezuQ/11462600 to your computer and use it in GitHub Desktop.
Pythonで前処理。ニコニコ動画のタグ検索結果をCSV形式に変換する ref: http://qiita.com/nezuq/items/eedfce32ade1bab8f94f
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# ncxml2csv.py | |
# Copyright (c) 2014 nezuq | |
# This software is released under the MIT License. | |
# http://opensource.org/licenses/mit-license.php | |
from __future__ import unicode_literals | |
import sys | |
import codecs | |
from lxml import etree | |
import copy | |
argvs = sys.argv
argc = len(argvs)

# Input file: first positional argument (decoded from UTF-8), falling back
# to 'INPUT.xml' when no argument is given.
FILE_INPUT = argvs[1].decode('UTF-8') if argc > 1 else 'INPUT.xml'

# Thresholds: a tag column is emitted only when the number of rows carrying
# that tag lies within [MIN_COUNT, MAX_COUNT] (second and third arguments).
MIN_COUNT = int(argvs[2]) if argc > 2 else 3
MAX_COUNT = int(argvs[3]) if argc > 3 else 9999

# Source-data output flag (fourth argument): main() includes the source
# columns listed in COLUMNS_NAME only when this equals 1.
DISP_SRCCOL = int(argvs[4]) if argc > 4 else 1

# Column names of the source-data part of the CSV, in output order.
COLUMNS_NAME = [
    'video_id',
    'user_id',
    'deleted',
    'title',
    'description',
    'length_in_seconds',
    'length',
    'size_high',
    'size_low',
    'movie_type',
    'thumbnail_url',
    'upload_time',
    'first_retrieve',
    'default_thread',
    'view_counter',
    'comment_num',
    'mylist_counter',
    'last_res_body',
    'watch_url',
    'thumb_type',
    'embeddable',
    'no_live_play',
    'option_flag_ichiba',
    'option_flag_community',
    'option_flag_domestic',
    'option_flag_comment_type',
    'option_flag_adult',
    'option_flag_mobile',
    'option_flag_economy_mp4',
    'option_flag_middle_video',
    'option_flag_mobile_ng_apple',
    'main_category',
    'main_category_key',
    'thread_id',
    'thread_public',
    'thread_num_res',
    'thread_community_id',
    'tags',
]
# XPaths (relative to each <video_info> element) that feed the leading
# columns of COLUMNS_NAME, in column order.  None marks a column that is
# always written empty.  The trailing 'tags' column is not listed here; it
# is filled from the serialized <tags> element.
_FIELD_PATHS = [
    'video/id',                           # video_id
    'video/user_id',                      # user_id
    'video/deleted',                      # deleted
    'video/title',                        # title
    'video/description',                  # description
    'video/length_in_seconds',            # length_in_seconds
    None,                                 # length
    None,                                 # size_high
    'video/size_low',                     # size_low
    'video/movie_type',                   # movie_type
    'video/thumbnail_url',                # thumbnail_url
    'video/upload_time',                  # upload_time
    'video/first_retrieve',               # first_retrieve
    'video/default_thread',               # default_thread
    'video/view_counter',                 # view_counter
    None,                                 # comment_num
    'video/mylist_counter',               # mylist_counter
    None,                                 # last_res_body
    None,                                 # watch_url
    None,                                 # thumb_type
    None,                                 # embeddable
    None,                                 # no_live_play
    'video/option_flag_ichiba',           # option_flag_ichiba
    'video/option_flag_community',        # option_flag_community
    'video/option_flag_domestic',         # option_flag_domestic
    'video/option_flag_comment_type',     # option_flag_comment_type
    'video/option_flag_adult',            # option_flag_adult
    'video/option_flag_mobile',           # option_flag_mobile
    'video/option_flag_economy_mp4',      # option_flag_economy_mp4
    'video/option_flag_middle_video',     # option_flag_middle_video
    'video/option_flag_mobile_ng_apple',  # option_flag_mobile_ng_apple
    'video/main_category',                # main_category
    'video/main_category_key',            # main_category_key
    'thread/id',                          # thread_id
    'thread/public',                      # thread_public
    'thread/num_res',                     # thread_num_res
    'thread/community_id',                # thread_community_id
]


def _csv_field(text):
    """Return *text* quoted, if necessary, so it occupies exactly one CSV cell.

    Text containing a comma, a double quote or a line break is wrapped in
    double quotes with embedded quotes doubled (RFC 4180 style); any other
    text is returned unchanged.
    """
    if any(ch in text for ch in ',"\r\n'):
        return '"%s"' % text.replace('"', '""')
    return text


def _video_row(vi):
    """Build the list of source-column cells for one <video_info> element.

    A missing element or an element without text yields an empty cell
    instead of raising.
    """
    values = [vi.findtext(path) if path else None for path in _FIELD_PATHS]

    tags_el = vi.find('tags')
    if tags_el is None:
        values.append(None)
    else:
        xml = etree.tostring(tags_el)
        # The default serialization is a byte string; it is decoded as
        # ASCII here -- NOTE(review): assumes the serializer's default
        # output is ASCII-only (character references), confirm for lxml.
        if isinstance(xml, bytes):
            xml = xml.decode('ascii')
        values.append(xml)

    return [_csv_field(value) if value else '' for value in values]


def main():
    """Convert the XML file FILE_INPUT to CSV on standard output.

    One CSV row is written per ``./video_info`` element.  When DISP_SRCCOL
    equals 1 each row starts with the COLUMNS_NAME source columns.  These
    are followed by one 0/1 indicator column per tag whose number of
    carrying rows lies within [MIN_COUNT, MAX_COUNT]; tag columns are
    ordered by descending count, ties keeping first-seen order.  The first
    output line is the header.
    """
    rows = []
    tag_rows = {}   # tag name -> set of indices of the rows carrying it
    tag_order = []  # tag names in first-seen order (deterministic ties)

    tree = etree.parse(FILE_INPUT)
    for index, vi in enumerate(tree.findall('./video_info')):
        rows.append(_video_row(vi))
        for tag_el in vi.findall('tags/tag_info/tag'):
            name = tag_el.text
            if name not in tag_rows:
                tag_rows[name] = set()
                tag_order.append(name)
            # A set keeps a tag repeated within one video counted once.
            tag_rows[name].add(index)

    matched = [name for name in tag_order
               if MIN_COUNT <= len(tag_rows[name]) <= MAX_COUNT]
    # list.sort is stable, also with reverse=True, so equal counts keep
    # their first-seen order.
    matched.sort(key=lambda name: len(tag_rows[name]), reverse=True)

    show_source = DISP_SRCCOL == 1

    header = list(COLUMNS_NAME) if show_source else []
    header.extend(_csv_field('%s' % name) for name in matched)
    # Single-argument print(...) behaves the same as the print statement.
    print(','.join(header))

    for index, row in enumerate(rows):
        cells = list(row) if show_source else []
        cells.extend('1' if index in tag_rows[name] else '0'
                     for name in matched)
        print(','.join(cells))
if __name__ == '__main__':
    # Route everything printed by main() through a UTF-8 encoding writer.
    utf8_writer = codecs.getwriter('utf_8')
    sys.stdout = utf8_writer(sys.stdout)
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
python ncxml2csv.py INPUT.xml 3 9999 1 > OUTPUT.csv |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment