Skip to content

Instantly share code, notes, and snippets.

@nezuQ
Created May 1, 2014 21:45
Show Gist options
  • Save nezuQ/11462600 to your computer and use it in GitHub Desktop.
Save nezuQ/11462600 to your computer and use it in GitHub Desktop.
Pythonで前処理。ニコニコ動画のタグ検索結果をCSV形式に変換する ref: http://qiita.com/nezuq/items/eedfce32ade1bab8f94f
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ncxml2csv.py
# Copyright (c) 2014 nezuq
# This software is released under the MIT License.
# http://opensource.org/licenses/mit-license.php
from __future__ import unicode_literals
import sys
import codecs
from lxml import etree
import copy
# Command-line configuration.
#
# Usage: python ncxml2csv.py [INPUT.xml [MIN_COUNT [MAX_COUNT [DISP_SRCCOL]]]]
argvs = sys.argv
argc = len(argvs)

# Input file (XML) to convert.
FILE_INPUT = 'INPUT.xml'
# Thresholds: tag columns whose occurrence count falls outside
# [MIN_COUNT, MAX_COUNT] are not output.
MIN_COUNT = 3
MAX_COUNT = 9999
# Source-data output flag: 1 emits the COLUMNS_NAME columns before the tag
# columns, any other value emits the tag columns only.
DISP_SRCCOL = 1

# Only interpret the command line when executed as a script.  When the module
# is imported (by another program or a test runner) sys.argv belongs to the
# host process and must not be parsed as this script's arguments.
if __name__ == '__main__':
    if 1 < argc:
        FILE_INPUT = argvs[1]
        # Python 2 passes arguments as byte strings; Python 3 passes text,
        # which has no .decode().
        if isinstance(FILE_INPUT, bytes):
            FILE_INPUT = FILE_INPUT.decode('UTF-8')
    if 2 < argc:
        MIN_COUNT = int(argvs[2])
    if 3 < argc:
        MAX_COUNT = int(argvs[3])
    if 4 < argc:
        DISP_SRCCOL = int(argvs[4])

# Column names of the source-data part of every output row, in output order.
COLUMNS_NAME = [
    'video_id', 'user_id', 'deleted', 'title', 'description',
    'length_in_seconds', 'length', 'size_high', 'size_low',
    'movie_type', 'thumbnail_url', 'upload_time', 'first_retrieve',
    'default_thread',
    'view_counter', 'comment_num', 'mylist_counter',
    'last_res_body', 'watch_url', 'thumb_type', 'embeddable', 'no_live_play',
    'option_flag_ichiba', 'option_flag_community', 'option_flag_domestic',
    'option_flag_comment_type',
    'option_flag_adult', 'option_flag_mobile', 'option_flag_economy_mp4',
    'option_flag_middle_video',
    'option_flag_mobile_ng_apple', 'main_category', 'main_category_key',
    'thread_id', 'thread_public', 'thread_num_res', 'thread_community_id',
    'tags',
]
# Element paths (relative to each <video_info>) whose text fills the leading
# source-data columns, in output order.  ``None`` marks a column that is always
# written empty.  The final ``tags`` column is handled separately in main().
_FIELD_PATHS = (
    'video/id',                           # video_id
    'video/user_id',                      # user_id
    'video/deleted',                      # deleted
    'video/title',                        # title
    'video/description',                  # description
    'video/length_in_seconds',            # length_in_seconds
    None,                                 # length
    None,                                 # size_high
    'video/size_low',                     # size_low
    'video/movie_type',                   # movie_type
    'video/thumbnail_url',                # thumbnail_url
    'video/upload_time',                  # upload_time
    'video/first_retrieve',               # first_retrieve
    'video/default_thread',               # default_thread
    'video/view_counter',                 # view_counter
    None,                                 # comment_num
    'video/mylist_counter',               # mylist_counter
    None,                                 # last_res_body
    None,                                 # watch_url
    None,                                 # thumb_type
    None,                                 # embeddable
    None,                                 # no_live_play
    'video/option_flag_ichiba',           # option_flag_ichiba
    'video/option_flag_community',        # option_flag_community
    'video/option_flag_domestic',         # option_flag_domestic
    'video/option_flag_comment_type',     # option_flag_comment_type
    'video/option_flag_adult',            # option_flag_adult
    'video/option_flag_mobile',           # option_flag_mobile
    'video/option_flag_economy_mp4',      # option_flag_economy_mp4
    'video/option_flag_middle_video',     # option_flag_middle_video
    'video/option_flag_mobile_ng_apple',  # option_flag_mobile_ng_apple
    'video/main_category',                # main_category
    'video/main_category_key',            # main_category_key
    'thread/id',                          # thread_id
    'thread/public',                      # thread_public
    'thread/num_res',                     # thread_num_res
    'thread/community_id',                # thread_community_id
)


def _csv_field(value):
    """Return *value* as a single CSV field, quoted per RFC 4180 if needed."""
    if any(ch in value for ch in (',', '"', '\r', '\n')):
        return '"' + value.replace('"', '""') + '"'
    return value


def main():
    """Convert the XML file FILE_INPUT to CSV on standard output.

    One CSV row is written per ``video_info`` element.  When DISP_SRCCOL is 1
    each row starts with the COLUMNS_NAME source columns (the last of which is
    the serialized ``<tags>`` element).  These are followed by one 0/1 column
    per tag name, telling whether the video carries that tag.  Only tags used
    by between MIN_COUNT and MAX_COUNT videos (inclusive) get a column; the
    columns are ordered by descending video count, ties in order of first
    appearance.

    Fields containing a comma, double quote or line break are quoted so that
    free text such as titles, descriptions and the tags XML cannot shift
    columns or split rows.  Missing elements produce empty fields, and tags
    without text are ignored.
    """
    rows = []        # source-data fields of every video, already CSV-escaped
    row_tags = []    # set of tag names of every video, parallel to ``rows``
    tag_counts = {}  # tag name -> number of videos carrying it
    tag_order = []   # tag names in order of first appearance

    tree = etree.parse(FILE_INPUT)
    for vi in tree.findall('./video_info'):
        # findtext() yields None for a missing element instead of raising.
        row = [vi.findtext(path) if path else '' for path in _FIELD_PATHS]
        tags_elem = vi.find('tags')
        if tags_elem is None:
            row.append('')
        else:
            # tostring() serializes to ASCII bytes by default; decode them so
            # every field is text on both Python 2 and Python 3.
            row.append(etree.tostring(tags_elem).decode('ascii'))
        rows.append([_csv_field(value or '') for value in row])

        names = set()
        for tag in vi.findall('tags/tag_info/tag'):
            name = tag.text
            if not name or name in names:
                continue  # empty tag, or repeated within the same video
            names.add(name)
            if name not in tag_counts:
                tag_counts[name] = 0
                tag_order.append(name)
            tag_counts[name] += 1
        row_tags.append(names)

    matched = [name for name in tag_order
               if MIN_COUNT <= tag_counts[name] <= MAX_COUNT]
    # The sort is stable, so equally frequent tags keep first-appearance order.
    matched.sort(key=lambda name: tag_counts[name], reverse=True)

    show_source = DISP_SRCCOL == 1
    out = sys.stdout
    header = list(COLUMNS_NAME) if show_source else []
    header.extend(_csv_field(name) for name in matched)
    out.write(','.join(header) + '\n')
    for row, names in zip(rows, row_tags):
        fields = list(row) if show_source else []
        fields.extend('1' if name in names else '0' for name in matched)
        out.write(','.join(fields) + '\n')
if __name__ == '__main__':
    # Always emit UTF-8, whatever the terminal/locale encoding is.  On
    # Python 3 sys.stdout is a text stream, so the byte-producing codec writer
    # must wrap its underlying ``buffer``; on Python 2 there is no such
    # attribute and the stream itself is wrapped, as before.
    sys.stdout = codecs.getwriter('utf_8')(
        getattr(sys.stdout, 'buffer', sys.stdout))
    main()
python ncxml2csv.py INPUT.xml 3 9999 1 > OUTPUT.csv
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment