nezuQ/ncxml2csv.py

## ncxml2csv.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# ncxml2csv.py
# Copyright (c) 2014 nezuq
# This software is released under the MIT License.
# http://opensource.org/licenses/mit-license.php

from __future__ import unicode_literals
import sys
import codecs
from lxml import etree
import copy

def parse_cli_args(argv):
    """Return ``(file_input, min_count, max_count, disp_srccol)`` from *argv*.

    ``argv[0]`` is the program name and is ignored.  Every following
    positional argument is optional and falls back to its default:

    1. input XML file (default ``'INPUT.xml'``)
    2. minimum tag occurrence count (default ``3``)
    3. maximum tag occurrence count (default ``9999``)
    4. source-column output flag (default ``1``)

    Raises:
        ValueError: if one of the numeric arguments is not an integer literal.
    """
    count = len(argv)
    file_input = 'INPUT.xml'
    if 1 < count:
        file_input = argv[1]
        # Python 2 hands over byte strings, Python 3 hands over text; only
        # byte strings have to be decoded.
        if isinstance(file_input, bytes):
            file_input = file_input.decode('UTF-8')
    min_count = int(argv[2]) if 2 < count else 3
    max_count = int(argv[3]) if 3 < count else 9999
    disp_srccol = int(argv[4]) if 4 < count else 1
    return file_input, min_count, max_count, disp_srccol


# Command-line overrides apply only when this file is run as a script.  When
# it is imported, sys.argv belongs to the importing program and is ignored.
argvs = sys.argv if __name__ == '__main__' else sys.argv[:1]
argc = len(argvs)

# FILE_INPUT  : input file
# MIN_COUNT /
# MAX_COUNT   : thresholds (columns of tags whose occurrence count falls
#               outside this range are not output)
# DISP_SRCCOL : source-data output flag (1 = also output the source columns)
FILE_INPUT, MIN_COUNT, MAX_COUNT, DISP_SRCCOL = parse_cli_args(argvs)

# Column names
COLUMNS_NAME = ['video_id','user_id','deleted','title','description','length_in_seconds','length','size_high','size_low',
                'movie_type','thumbnail_url','upload_time','first_retrieve','default_thread',
                'view_counter','comment_num','mylist_counter',
                'last_res_body','watch_url','thumb_type','embeddable','no_live_play',
                'option_flag_ichiba','option_flag_community','option_flag_domestic','option_flag_comment_type',
                'option_flag_adult','option_flag_mobile','option_flag_economy_mp4','option_flag_middle_video',
                'option_flag_mobile_ng_apple','main_category','main_category_key',
                'thread_id','thread_public','thread_num_res','thread_community_id','tags']

# Source element (path relative to <video_info>) for every column that
# precedes the trailing 'tags' column, in COLUMNS_NAME order.  None marks a
# column that has no source element and is always written blank.
_FIELD_PATHS = (
    'video/id',                           # video_id
    'video/user_id',                      # user_id
    'video/deleted',                      # deleted
    'video/title',                        # title
    'video/description',                  # description
    'video/length_in_seconds',            # length_in_seconds
    None,                                 # length
    None,                                 # size_high
    'video/size_low',                     # size_low
    'video/movie_type',                   # movie_type
    'video/thumbnail_url',                # thumbnail_url
    'video/upload_time',                  # upload_time
    'video/first_retrieve',               # first_retrieve
    'video/default_thread',               # default_thread
    'video/view_counter',                 # view_counter
    None,                                 # comment_num
    'video/mylist_counter',               # mylist_counter
    None,                                 # last_res_body
    None,                                 # watch_url
    None,                                 # thumb_type
    None,                                 # embeddable
    None,                                 # no_live_play
    'video/option_flag_ichiba',           # option_flag_ichiba
    'video/option_flag_community',        # option_flag_community
    'video/option_flag_domestic',         # option_flag_domestic
    'video/option_flag_comment_type',     # option_flag_comment_type
    'video/option_flag_adult',            # option_flag_adult
    'video/option_flag_mobile',           # option_flag_mobile
    'video/option_flag_economy_mp4',      # option_flag_economy_mp4
    'video/option_flag_middle_video',     # option_flag_middle_video
    'video/option_flag_mobile_ng_apple',  # option_flag_mobile_ng_apple
    'video/main_category',                # main_category
    'video/main_category_key',            # main_category_key
    'thread/id',                          # thread_id
    'thread/public',                      # thread_public
    'thread/num_res',                     # thread_num_res
    'thread/community_id',                # thread_community_id
)


def _cell(text):
    """Return *text* as an output cell: '' for None/empty, commas made fullwidth."""
    return text.replace(',', '，') if text else ''


def _child_text(parent, path):
    """Return the text of the element at *path* under *parent*, or None if absent."""
    node = parent.find(path)
    return None if node is None else node.text


def main():
    """Convert the video_info records of FILE_INPUT to comma-separated lines.

    One line is written to ``sys.stdout`` per ``./video_info`` element, after
    a header line.  When ``DISP_SRCCOL == 1`` each line starts with the
    columns named in ``COLUMNS_NAME``; the last of them holds the serialized
    ``<tags>`` element.  These are followed by one 1/0 indicator column per
    tag name found under ``tags/tag_info/tag``, limited to tags attached to
    between ``MIN_COUNT`` and ``MAX_COUNT`` records (inclusive) and ordered by
    that count, highest first.  Tags with equal counts keep the order in which
    they were first seen in the input.

    Cells are not quoted.  Commas inside values and tag names are replaced by
    the fullwidth comma so the column count stays constant.  A missing source
    element produces an empty cell.

    NOTE(review): line breaks inside values are written through unchanged, so
    a multi-line value still splits an output row.
    """
    rows = []        # sanitised source cells, one list per record
    row_tags = []    # set of tag names per record, parallel to ``rows``
    tag_counts = {}  # tag name -> number of records carrying it
    tag_order = []   # tag names in first-seen order (deterministic tie-break)

    tree = etree.parse(FILE_INPUT)
    for vi in tree.findall('./video_info'):
        cells = [_cell(_child_text(vi, path)) if path else ''
                 for path in _FIELD_PATHS]
        tags_node = vi.find('tags')
        if tags_node is None:
            cells.append('')
        else:
            # tostring() yields ASCII bytes by default; decode so the cell is
            # text on both Python 2 and Python 3.
            cells.append(_cell(etree.tostring(tags_node).decode('ascii')))
        rows.append(cells)

        names = set()
        for tag in vi.findall('tags/tag_info/tag'):
            name = tag.text
            if name in names:
                # A tag repeated within one record counts once.
                continue
            names.add(name)
            if name not in tag_counts:
                tag_counts[name] = 0
                tag_order.append(name)
            tag_counts[name] += 1
        row_tags.append(names)

    selected = [name for name in tag_order
                if MIN_COUNT <= tag_counts[name] <= MAX_COUNT]
    # list.sort() is stable, also with reverse=True, so equal counts keep
    # their first-seen order.
    selected.sort(key=lambda name: tag_counts[name], reverse=True)

    show_source = DISP_SRCCOL == 1
    write = sys.stdout.write
    header = list(COLUMNS_NAME) if show_source else []
    header.extend(_cell('%s' % name) for name in selected)
    write(','.join(header) + '\n')
    for cells, names in zip(rows, row_tags):
        line = list(cells) if show_source else []
        line.extend('1' if name in names else '0' for name in selected)
        write(','.join(line) + '\n')

if __name__ == '__main__':
    # Emit UTF-8 regardless of the console/locale encoding.  The codec writer
    # needs a byte sink: use the binary buffer when stdout is a text stream
    # that exposes one (Python 3), otherwise stdout itself (Python 2).
    sys.stdout = codecs.getwriter('utf_8')(getattr(sys.stdout, 'buffer', sys.stdout))
    main()

## ncxml2csv.pyを実行する
python ncxml2csv.py INPUT.xml 3 9999 1 > OUTPUT.csv
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	# ncxml2csv.py
	# Copyright (c) 2014 nezuq
	# This software is released under the MIT License.
	# http://opensource.org/licenses/mit-license.php

	from __future__ import unicode_literals
	import sys
	import codecs
	from lxml import etree
	import copy

	def parse_cli_args(argv):
	    """Return ``(file_input, min_count, max_count, disp_srccol)`` from *argv*.

	    ``argv[0]`` is the program name and is ignored.  Every following
	    positional argument is optional and falls back to its default:

	    1. input XML file (default ``'INPUT.xml'``)
	    2. minimum tag occurrence count (default ``3``)
	    3. maximum tag occurrence count (default ``9999``)
	    4. source-column output flag (default ``1``)

	    Raises:
	        ValueError: if one of the numeric arguments is not an integer literal.
	    """
	    count = len(argv)
	    file_input = 'INPUT.xml'
	    if 1 < count:
	        file_input = argv[1]
	        # Python 2 hands over byte strings, Python 3 hands over text; only
	        # byte strings have to be decoded.
	        if isinstance(file_input, bytes):
	            file_input = file_input.decode('UTF-8')
	    min_count = int(argv[2]) if 2 < count else 3
	    max_count = int(argv[3]) if 3 < count else 9999
	    disp_srccol = int(argv[4]) if 4 < count else 1
	    return file_input, min_count, max_count, disp_srccol


	# Command-line overrides apply only when this file is run as a script.  When
	# it is imported, sys.argv belongs to the importing program and is ignored.
	argvs = sys.argv if __name__ == '__main__' else sys.argv[:1]
	argc = len(argvs)

	# FILE_INPUT  : input file
	# MIN_COUNT /
	# MAX_COUNT   : thresholds (columns of tags whose occurrence count falls
	#               outside this range are not output)
	# DISP_SRCCOL : source-data output flag (1 = also output the source columns)
	FILE_INPUT, MIN_COUNT, MAX_COUNT, DISP_SRCCOL = parse_cli_args(argvs)

	# Column names
	COLUMNS_NAME = ['video_id','user_id','deleted','title','description','length_in_seconds','length','size_high','size_low',
	                'movie_type','thumbnail_url','upload_time','first_retrieve','default_thread',
	                'view_counter','comment_num','mylist_counter',
	                'last_res_body','watch_url','thumb_type','embeddable','no_live_play',
	                'option_flag_ichiba','option_flag_community','option_flag_domestic','option_flag_comment_type',
	                'option_flag_adult','option_flag_mobile','option_flag_economy_mp4','option_flag_middle_video',
	                'option_flag_mobile_ng_apple','main_category','main_category_key',
	                'thread_id','thread_public','thread_num_res','thread_community_id','tags']

	# Source element (path relative to <video_info>) for every column that
	# precedes the trailing 'tags' column, in COLUMNS_NAME order.  None marks a
	# column that has no source element and is always written blank.
	_FIELD_PATHS = (
	    'video/id',                           # video_id
	    'video/user_id',                      # user_id
	    'video/deleted',                      # deleted
	    'video/title',                        # title
	    'video/description',                  # description
	    'video/length_in_seconds',            # length_in_seconds
	    None,                                 # length
	    None,                                 # size_high
	    'video/size_low',                     # size_low
	    'video/movie_type',                   # movie_type
	    'video/thumbnail_url',                # thumbnail_url
	    'video/upload_time',                  # upload_time
	    'video/first_retrieve',               # first_retrieve
	    'video/default_thread',               # default_thread
	    'video/view_counter',                 # view_counter
	    None,                                 # comment_num
	    'video/mylist_counter',               # mylist_counter
	    None,                                 # last_res_body
	    None,                                 # watch_url
	    None,                                 # thumb_type
	    None,                                 # embeddable
	    None,                                 # no_live_play
	    'video/option_flag_ichiba',           # option_flag_ichiba
	    'video/option_flag_community',        # option_flag_community
	    'video/option_flag_domestic',         # option_flag_domestic
	    'video/option_flag_comment_type',     # option_flag_comment_type
	    'video/option_flag_adult',            # option_flag_adult
	    'video/option_flag_mobile',           # option_flag_mobile
	    'video/option_flag_economy_mp4',      # option_flag_economy_mp4
	    'video/option_flag_middle_video',     # option_flag_middle_video
	    'video/option_flag_mobile_ng_apple',  # option_flag_mobile_ng_apple
	    'video/main_category',                # main_category
	    'video/main_category_key',            # main_category_key
	    'thread/id',                          # thread_id
	    'thread/public',                      # thread_public
	    'thread/num_res',                     # thread_num_res
	    'thread/community_id',                # thread_community_id
	)


	def _cell(text):
	    """Return *text* as an output cell: '' for None/empty, commas made fullwidth."""
	    return text.replace(',', '，') if text else ''


	def _child_text(parent, path):
	    """Return the text of the element at *path* under *parent*, or None if absent."""
	    node = parent.find(path)
	    return None if node is None else node.text


	def main():
	    """Convert the video_info records of FILE_INPUT to comma-separated lines.

	    One line is written to ``sys.stdout`` per ``./video_info`` element, after
	    a header line.  When ``DISP_SRCCOL == 1`` each line starts with the
	    columns named in ``COLUMNS_NAME``; the last of them holds the serialized
	    ``<tags>`` element.  These are followed by one 1/0 indicator column per
	    tag name found under ``tags/tag_info/tag``, limited to tags attached to
	    between ``MIN_COUNT`` and ``MAX_COUNT`` records (inclusive) and ordered by
	    that count, highest first.  Tags with equal counts keep the order in which
	    they were first seen in the input.

	    Cells are not quoted.  Commas inside values and tag names are replaced by
	    the fullwidth comma so the column count stays constant.  A missing source
	    element produces an empty cell.

	    NOTE(review): line breaks inside values are written through unchanged, so
	    a multi-line value still splits an output row.
	    """
	    rows = []        # sanitised source cells, one list per record
	    row_tags = []    # set of tag names per record, parallel to ``rows``
	    tag_counts = {}  # tag name -> number of records carrying it
	    tag_order = []   # tag names in first-seen order (deterministic tie-break)

	    tree = etree.parse(FILE_INPUT)
	    for vi in tree.findall('./video_info'):
	        cells = [_cell(_child_text(vi, path)) if path else ''
	                 for path in _FIELD_PATHS]
	        tags_node = vi.find('tags')
	        if tags_node is None:
	            cells.append('')
	        else:
	            # tostring() yields ASCII bytes by default; decode so the cell is
	            # text on both Python 2 and Python 3.
	            cells.append(_cell(etree.tostring(tags_node).decode('ascii')))
	        rows.append(cells)

	        names = set()
	        for tag in vi.findall('tags/tag_info/tag'):
	            name = tag.text
	            if name in names:
	                # A tag repeated within one record counts once.
	                continue
	            names.add(name)
	            if name not in tag_counts:
	                tag_counts[name] = 0
	                tag_order.append(name)
	            tag_counts[name] += 1
	        row_tags.append(names)

	    selected = [name for name in tag_order
	                if MIN_COUNT <= tag_counts[name] <= MAX_COUNT]
	    # list.sort() is stable, also with reverse=True, so equal counts keep
	    # their first-seen order.
	    selected.sort(key=lambda name: tag_counts[name], reverse=True)

	    show_source = DISP_SRCCOL == 1
	    write = sys.stdout.write
	    header = list(COLUMNS_NAME) if show_source else []
	    header.extend(_cell('%s' % name) for name in selected)
	    write(','.join(header) + '\n')
	    for cells, names in zip(rows, row_tags):
	        line = list(cells) if show_source else []
	        line.extend('1' if name in names else '0' for name in selected)
	        write(','.join(line) + '\n')

	if __name__ == '__main__':
	    # Emit UTF-8 regardless of the console/locale encoding.  The codec writer
	    # needs a byte sink: use the binary buffer when stdout is a text stream
	    # that exposes one (Python 3), otherwise stdout itself (Python 2).
	    sys.stdout = codecs.getwriter('utf_8')(getattr(sys.stdout, 'buffer', sys.stdout))
	    main()