fffonion/wiki_movie_names.py

## wiki_movie_names.py
#encoding:utf-8
import httplib2
import lxml.html as lhtml
import re
import os
import urllib
import sys
import gzip
reload(sys)
sys.setdefaultencoding('utf-8')

ht = httplib2.Http()#cache = r"z:/temp/wiki")
regex_speical = '(?:Wikipedia|Portal|Help|Special|Category|Template)(.+)'
not_right = ['日语写法', '日本电影']
class MWException(Exception):
    pass

def get_orig_name(t, l, prefered = 'http://zh.wikipedia.org/zh-cn/', file_prefix = 'USTV_', disambig_keywd = "电视剧"):
    if l.startswith('http:') or l.startswith('https:'):
        try:
            real_keywd = re.findall('(?:wiki/|w/|index.php\?title=)([^\&]+)', l)[0]
        except IndexError:
            raise ValueError(" invalid link %s" % l)
    else:
        real_keywd = l

    url = prefered + real_keywd
    file_key = './cache/' + file_prefix + urllib.unquote(real_keywd.replace('/', '#')).decode('utf-8')
    if os.path.exists(file_key):
        ct = gzip.open(file_key, 'rb').read()
    else:
        resp, ct = ht.request(url)
        if resp['status'] == '200':
            with gzip.open(file_key, 'wb') as f:
                f.write(ct)
        else:
            raise MWException('got %s when processing %s' % (resp['status'], url))
    htmldoc = lhtml.fromstring(ct.decode('utf-8'))
    if ct.find('disambigbox')!=-1:#这是一个消歧义页
        dis_s = htmldoc.xpath('//div[@id="mw-content-text"]/ul/li//a')
        for d in dis_s:
            if d.text_content().encode('utf-8').find(disambig_keywd)!=-1 or d.values()[-1].encode('utf-8').find(disambig_keywd)!=-1:
                new_href = d.values()[0].lstrip('/wiki/')
                if new_href.find('action=edit')==-1:
                    print('REDIR-DIS:%s' % new_href)
                    return get_orig_name(t, new_href, prefered, file_prefix, disambig_keywd)
        raise MWException("Can't define %s" % real_keywd)

    if ct.find('dablink')!=-1:#xxx重定向至此。关于yy，详见zz
        dab_s = htmldoc.xpath('//div[@class="notice dablink"]//a')
        for d in dab_s:
            if d.text_content().encode('utf-8').find(disambig_keywd)!=-1 or d.values()[-1].encode('utf-8').find(disambig_keywd)!=-1:
                new_href = d.values()[0].lstrip('/wiki/')
                if new_href.find('action=edit')==-1:
                    print('REDIR-DAB:%s' % new_href)
                    return get_orig_name(t, new_href, prefered, file_prefix, disambig_keywd)
        #don't raise exp here 'cause current page may be right
        #raise MWException("Can't define %s" % real_keywd)
    try:
        first_p = htmldoc.xpath('//div[@id="mw-content-text"]/p[position()=1]')[0]
    except IndexError:
        raise MWException("not right page")
    first_p_plain = first_p.text_content().encode('utf-8')
    infobox_title = htmldoc.xpath('//th[1]')
    jp_title = htmldoc.xpath('//span[@lang="ja"]')
    p_title = htmldoc.xpath('//span[@dir="auto"]')[0].text_content().encode('utf-8')
    if p_title.endswith(')'):#has (xxx)
        try:
            p_title = p_title.replace(re.findall('\([^\)]+\)$', p_title)[0], '')
        except IndexError:
            print('not changed')
    #open(r'z:/233.txt','w').write(first_p_plain)
    while True:
        maybe_eng = ''
        eng = re.findall(p_title+ '.*?[:：]*([\w\d\s!\?\'",\.:;\&-]+)[）；，\)]', first_p_plain)
        if eng and len(eng[0])>1:
            break
        if infobox_title:
            print('try infobox')
            info_eng = infobox_title[0].text_content().split('\n')
            if len(info_eng) == 1:#只有一条可能为中文，试试能不能匹配
                eng = re.findall(info_eng[0].encode('utf-8')+ '.*?[:：]*([\w\d\s!\?\'",\.:;\&-]+)[）；，\)]', first_p_plain)
                if eng and len(eng[0])>1:
                    print('infoxbox->eng match')
                    break
            elif info_eng:#>1
                print('infobox == eng')
                eng = [info_eng[-1]]
                break
            if info_eng:
                info_eng = info_eng[0]
            if jp_title:
                eng = [jp_title[0].text_content()]
                break
            if len(info_eng) < 50 and info_eng != t and info_eng != p_title:
                print('infobox maybe')
                #raw_input(info_eng.decode('utf-8').encode('gbk'))
                maybe_eng = [info_eng]
                #break
        #try without title
        eng = re.findall('.*?[:：]*([\w\d\s!\?\'",\.:;\&-]+)[）；，\)]', first_p_plain)
        if eng and len(eng[0])>1:
            break
        if maybe_eng:
            eng = maybe_eng
            break
        if not eng or len(eng[0])<=1:
            eng = ['']
            break
            #print mw_content_plain.decode('utf-8').encode('gbk')
            #raise MWException("Original name not found for %s" % real_keywd)
    return eng[0], p_title.decode('utf-8')


def valid_link(link, link_type = 'wiki'):
    if re.findall(regex_speical, link) or not re.findall('/'+link_type + '/', link):
        return False
    else:
        return True

all_names = {}
name_record = []
sql_data = []
sqlfile = r"z:/233.sql"
def make_list(idx_f, file_prefix = 'USTV_', disambig_keywd = "电视剧", sel_node = u"//div[@id='mw-content-text']/*/li/a"):
    page = lhtml.fromstring(open(idx_f).read().decode('utf-8'))
    err_names = []
    li_as = page.xpath(sel_node)
    sqlfile_f = open(sqlfile, 'a')
    for li_a in li_as:
        href = li_a.values()[0]
        title = str(li_a.text_content())
        fast_t = re.findall('.*?[:：]*([\w\d\s!\?\'",\.:;\&-]+)(?:,[\s\d]+)[）；，\)]', li_a.getparent().text_content())

        if valid_link(href):
            #all_names.append((href, title))
            try:
                o = get_orig_name(title, href, file_prefix = file_prefix, disambig_keywd = disambig_keywd)
            except MWException as ex:
                print('ERR:%s (%s)' % (title.decode('utf-8'), ex))#.encode('utf-8'))
                err_names.append(title)
                #raw_input()
            else:
                if not o[0]:
                    err_names.append(title)
                else:
                    #if title not in all_names:
                    #    allnames[title] = []
                    #all_names[title]+=[o[0], o[1]]
                    print(('%s => %s / %s' % (title, o[0], o[1])).encode('gbk', 'replace'))
                    orig = [o[0].strip()]
                    if fast_t:
                         orig.append(fast_t[0].strip())
                    orig = set(orig)
                    chn_name = set([o[1].strip(), title.strip().decode('utf-8')])
                    for _1 in chn_name:
                        for _2 in orig:
                            if _1 == _2:
                                continue
                            if _1 not in name_record:
                                sql_data.append("(\'"+_1.replace("'","\\\'")+"\', \'"+_2.replace("'","\\\'")+"\')")
                                name_record.append(_1)
                            else:
                                print('%s already has a val' % _1)

                    #yield o[0], o[1]
    print('\n'.join(map(lambda x:x.decode('utf-8'), err_names)))

open(sqlfile, 'w').write('INSERT INTO `movie_sug` VALUES ')
make_list(r'./america_tv_list.htm')
make_list(r'./japan_mvi_list.htm', file_prefix = "JPMVI_", disambig_keywd = "电影")
make_list(r'./france_mvi_cat.htm', file_prefix = "FRMVI_", disambig_keywd = "电影", sel_node = u"//div[@id='mw-pages']/div[@class='mw-content-ltr']//li/a")
# #print valid_link('/w/index.php?title=%E7%BC%BA%E5%B8%AD%E7%9A%84%E4%BA%BA&amp;action=edit&amp;redlink=1')
make_list(r'./america_mvi_list.htm', file_prefix = "USMVI_", disambig_keywd = "电影")
make_list(r'./india_mvi_cat.htm', file_prefix = "INDMVI_", disambig_keywd = "电影", sel_node = u"//div[@id='mw-pages']/div[@class='mw-content-ltr']//li/a")
make_list(r'./japan_mvi_cat.htm', file_prefix = "JAPMVI_", disambig_keywd = "电影", sel_node = u"//div[@id='mw-pages']/div[@class='mw-content-ltr']//li/a")

#print '\n'.join(all_names)
#print hrefs[0].text_content().encode('utf-8')
make_list(r'./eng_tv_cat.htm', file_prefix = "ENTV_", disambig_keywd = "电视剧", sel_node = u"//div[@id='mw-pages']/div[@class='mw-content-ltr']//li/a")
#make_list(r'z:/disney.htm')
open(sqlfile, 'a').write(','.join(sql_data) + ";")
	#encoding:utf-8
	import httplib2
	import lxml.html as lhtml
	import re
	import os
	import urllib
	import sys
	import gzip
	reload(sys)
	sys.setdefaultencoding('utf-8')

	ht = httplib2.Http()#cache = r"z:/temp/wiki")
	regex_speical = '(?:Wikipedia\|Portal\|Help\|Special\|Category\|Template)(.+)'
	not_right = ['日语写法', '日本电影']
	class MWException(Exception):
	pass

	def get_orig_name(t, l, prefered = 'http://zh.wikipedia.org/zh-cn/', file_prefix = 'USTV_', disambig_keywd = "电视剧"):
	if l.startswith('http:') or l.startswith('https:'):
	try:
	real_keywd = re.findall('(?:wiki/\|w/\|index.php\?title=)([^\&]+)', l)[0]
	except IndexError:
	raise ValueError(" invalid link %s" % l)
	else:
	real_keywd = l

	url = prefered + real_keywd
	file_key = './cache/' + file_prefix + urllib.unquote(real_keywd.replace('/', '#')).decode('utf-8')
	if os.path.exists(file_key):
	ct = gzip.open(file_key, 'rb').read()
	else:
	resp, ct = ht.request(url)
	if resp['status'] == '200':
	with gzip.open(file_key, 'wb') as f:
	f.write(ct)
	else:
	raise MWException('got %s when processing %s' % (resp['status'], url))
	htmldoc = lhtml.fromstring(ct.decode('utf-8'))
	if ct.find('disambigbox')!=-1:#这是一个消歧义页
	dis_s = htmldoc.xpath('//div[@id="mw-content-text"]/ul/li//a')
	for d in dis_s:
	if d.text_content().encode('utf-8').find(disambig_keywd)!=-1 or d.values()[-1].encode('utf-8').find(disambig_keywd)!=-1:
	new_href = d.values()[0].lstrip('/wiki/')
	if new_href.find('action=edit')==-1:
	print('REDIR-DIS:%s' % new_href)
	return get_orig_name(t, new_href, prefered, file_prefix, disambig_keywd)
	raise MWException("Can't define %s" % real_keywd)

	if ct.find('dablink')!=-1:#xxx重定向至此。关于yy，详见zz
	dab_s = htmldoc.xpath('//div[@class="notice dablink"]//a')
	for d in dab_s:
	if d.text_content().encode('utf-8').find(disambig_keywd)!=-1 or d.values()[-1].encode('utf-8').find(disambig_keywd)!=-1:
	new_href = d.values()[0].lstrip('/wiki/')
	if new_href.find('action=edit')==-1:
	print('REDIR-DAB:%s' % new_href)
	return get_orig_name(t, new_href, prefered, file_prefix, disambig_keywd)
	#don't raise exp here 'cause current page may be right
	#raise MWException("Can't define %s" % real_keywd)
	try:
	first_p = htmldoc.xpath('//div[@id="mw-content-text"]/p[position()=1]')[0]
	except IndexError:
	raise MWException("not right page")
	first_p_plain = first_p.text_content().encode('utf-8')
	infobox_title = htmldoc.xpath('//th[1]')
	jp_title = htmldoc.xpath('//span[@lang="ja"]')
	p_title = htmldoc.xpath('//span[@dir="auto"]')[0].text_content().encode('utf-8')
	if p_title.endswith(')'):#has (xxx)
	try:
	p_title = p_title.replace(re.findall('\([^\)]+\)$', p_title)[0], '')
	except IndexError:
	print('not changed')
	#open(r'z:/233.txt','w').write(first_p_plain)
	while True:
	maybe_eng = ''
	eng = re.findall(p_title+ '.?[:：]([\w\d\s!\?\'",\.:;\&-]+)[）；，\)]', first_p_plain)
	if eng and len(eng[0])>1:
	break
	if infobox_title:
	print('try infobox')
	info_eng = infobox_title[0].text_content().split('\n')
	if len(info_eng) == 1:#只有一条可能为中文，试试能不能匹配
	eng = re.findall(info_eng[0].encode('utf-8')+ '.?[:：]([\w\d\s!\?\'",\.:;\&-]+)[）；，\)]', first_p_plain)
	if eng and len(eng[0])>1:
	print('infoxbox->eng match')
	break
	elif info_eng:#>1
	print('infobox == eng')
	eng = [info_eng[-1]]
	break
	if info_eng:
	info_eng = info_eng[0]
	if jp_title:
	eng = [jp_title[0].text_content()]
	break
	if len(info_eng) < 50 and info_eng != t and info_eng != p_title:
	print('infobox maybe')
	#raw_input(info_eng.decode('utf-8').encode('gbk'))
	maybe_eng = [info_eng]
	#break
	#try without title
	eng = re.findall('.?[:：]([\w\d\s!\?\'",\.:;\&-]+)[）；，\)]', first_p_plain)
	if eng and len(eng[0])>1:
	break
	if maybe_eng:
	eng = maybe_eng
	break
	if not eng or len(eng[0])<=1:
	eng = ['']
	break
	#print mw_content_plain.decode('utf-8').encode('gbk')
	#raise MWException("Original name not found for %s" % real_keywd)
	return eng[0], p_title.decode('utf-8')




	def valid_link(link, link_type = 'wiki'):
	if re.findall(regex_speical, link) or not re.findall('/'+link_type + '/', link):
	return False
	else:
	return True

	all_names = {}
	name_record = []
	sql_data = []
	sqlfile = r"z:/233.sql"
	def make_list(idx_f, file_prefix = 'USTV_', disambig_keywd = "电视剧", sel_node = u"//div[@id='mw-content-text']/*/li/a"):
	page = lhtml.fromstring(open(idx_f).read().decode('utf-8'))
	err_names = []
	li_as = page.xpath(sel_node)
	sqlfile_f = open(sqlfile, 'a')
	for li_a in li_as:
	href = li_a.values()[0]
	title = str(li_a.text_content())
	fast_t = re.findall('.?[:：]([\w\d\s!\?\'",\.:;\&-]+)(?:,[\s\d]+)[）；，\)]', li_a.getparent().text_content())

	if valid_link(href):
	#all_names.append((href, title))
	try:
	o = get_orig_name(title, href, file_prefix = file_prefix, disambig_keywd = disambig_keywd)
	except MWException as ex:
	print('ERR:%s (%s)' % (title.decode('utf-8'), ex))#.encode('utf-8'))
	err_names.append(title)
	#raw_input()
	else:
	if not o[0]:
	err_names.append(title)
	else:
	#if title not in all_names:
	# allnames[title] = []
	#all_names[title]+=[o[0], o[1]]
	print(('%s => %s / %s' % (title, o[0], o[1])).encode('gbk', 'replace'))
	orig = [o[0].strip()]
	if fast_t:
	orig.append(fast_t[0].strip())
	orig = set(orig)
	chn_name = set([o[1].strip(), title.strip().decode('utf-8')])
	for _1 in chn_name:
	for _2 in orig:
	if _1 == _2:
	continue
	if _1 not in name_record:
	sql_data.append("(\'"+_1.replace("'","\\\'")+"\', \'"+_2.replace("'","\\\'")+"\')")
	name_record.append(_1)
	else:
	print('%s already has a val' % _1)

	#yield o[0], o[1]
	print('\n'.join(map(lambda x:x.decode('utf-8'), err_names)))

	open(sqlfile, 'w').write('INSERT INTO `movie_sug` VALUES ')
	make_list(r'./america_tv_list.htm')
	make_list(r'./japan_mvi_list.htm', file_prefix = "JPMVI_", disambig_keywd = "电影")
	make_list(r'./france_mvi_cat.htm', file_prefix = "FRMVI_", disambig_keywd = "电影", sel_node = u"//div[@id='mw-pages']/div[@class='mw-content-ltr']//li/a")
	# #print valid_link('/w/index.php?title=%E7%BC%BA%E5%B8%AD%E7%9A%84%E4%BA%BA&action=edit&redlink=1')
	make_list(r'./america_mvi_list.htm', file_prefix = "USMVI_", disambig_keywd = "电影")
	make_list(r'./india_mvi_cat.htm', file_prefix = "INDMVI_", disambig_keywd = "电影", sel_node = u"//div[@id='mw-pages']/div[@class='mw-content-ltr']//li/a")
	make_list(r'./japan_mvi_cat.htm', file_prefix = "JAPMVI_", disambig_keywd = "电影", sel_node = u"//div[@id='mw-pages']/div[@class='mw-content-ltr']//li/a")

	#print '\n'.join(all_names)
	#print hrefs[0].text_content().encode('utf-8')
	make_list(r'./eng_tv_cat.htm', file_prefix = "ENTV_", disambig_keywd = "电视剧", sel_node = u"//div[@id='mw-pages']/div[@class='mw-content-ltr']//li/a")
	#make_list(r'z:/disney.htm')
	open(sqlfile, 'a').write(','.join(sql_data) + ";")