Get the original (non-Chinese) titles of movies and TV shows from their Chinese Wikipedia articles.
#encoding:utf-8
import httplib2
import lxml.html as lhtml
import re
import os
import urllib
import sys
import gzip
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 hack so implicit str<->unicode conversions use UTF-8
ht = httplib2.Http()  # cache=r"z:/temp/wiki"
regex_speical = '(?:Wikipedia|Portal|Help|Special|Category|Template)(.+)'  # namespaced (non-article) links
not_right = ['日语写法', '日本电影']  # currently unused
class MWException(Exception):
    pass

def get_orig_name(t, l, prefered='http://zh.wikipedia.org/zh-cn/', file_prefix='USTV_', disambig_keywd="电视剧"):
    # t: display title from the list page; l: article link or keyword.
    # Returns (original title, Chinese page title).
    if l.startswith('http:') or l.startswith('https:'):
        try:
            real_keywd = re.findall('(?:wiki/|w/|index.php\?title=)([^\&]+)', l)[0]
        except IndexError:
            raise ValueError("invalid link %s" % l)
    else:
        real_keywd = l
    url = prefered + real_keywd
    # cache fetched pages on disk, gzip-compressed, keyed by the unquoted article name
    file_key = './cache/' + file_prefix + urllib.unquote(real_keywd.replace('/', '#')).decode('utf-8')
    if os.path.exists(file_key):
        ct = gzip.open(file_key, 'rb').read()
    else:
        resp, ct = ht.request(url)
        if resp['status'] == '200':
            with gzip.open(file_key, 'wb') as f:
                f.write(ct)
        else:
            raise MWException('got %s when processing %s' % (resp['status'], url))
    htmldoc = lhtml.fromstring(ct.decode('utf-8'))
    if ct.find('disambigbox') != -1:  # this is a disambiguation page
        dis_s = htmldoc.xpath('//div[@id="mw-content-text"]/ul/li//a')
        for d in dis_s:
            # follow the first entry whose text or title attribute contains the keyword (电视剧/电影)
            if d.text_content().encode('utf-8').find(disambig_keywd) != -1 or d.values()[-1].encode('utf-8').find(disambig_keywd) != -1:
                new_href = d.values()[0].lstrip('/wiki/')
                if new_href.find('action=edit') == -1:
                    print('REDIR-DIS:%s' % new_href)
                    return get_orig_name(t, new_href, prefered, file_prefix, disambig_keywd)
        raise MWException("Can't define %s" % real_keywd)
    if ct.find('dablink') != -1:  # hatnote: "xxx redirects here. For yy, see zz"
        dab_s = htmldoc.xpath('//div[@class="notice dablink"]//a')
        for d in dab_s:
            if d.text_content().encode('utf-8').find(disambig_keywd) != -1 or d.values()[-1].encode('utf-8').find(disambig_keywd) != -1:
                new_href = d.values()[0].lstrip('/wiki/')
                if new_href.find('action=edit') == -1:
                    print('REDIR-DAB:%s' % new_href)
                    return get_orig_name(t, new_href, prefered, file_prefix, disambig_keywd)
        # don't raise here: the current page may already be the right one
        # raise MWException("Can't define %s" % real_keywd)
    try:
        first_p = htmldoc.xpath('//div[@id="mw-content-text"]/p[position()=1]')[0]
    except IndexError:
        raise MWException("not right page")
    first_p_plain = first_p.text_content().encode('utf-8')
    infobox_title = htmldoc.xpath('//th[1]')          # first infobox header cell, if any
    jp_title = htmldoc.xpath('//span[@lang="ja"]')    # Japanese-language span, if any
    p_title = htmldoc.xpath('//span[@dir="auto"]')[0].text_content().encode('utf-8')  # page title
    if p_title.endswith(')'):  # strip a trailing "(xxx)" disambiguation qualifier
        try:
            p_title = p_title.replace(re.findall('\([^\)]+\)$', p_title)[0], '')
        except IndexError:
            print('not changed')
    # open(r'z:/233.txt','w').write(first_p_plain)
    # try several sources for the original title, most specific first
    while True:
        maybe_eng = ''
        # lead paragraph: "<page title> (Original Name, ...)" style
        eng = re.findall(p_title + '.*?[::]*([\w\d\s!\?\'",\.:;\&-]+)[);,\)]', first_p_plain)
        if eng and len(eng[0]) > 1:
            break
        if infobox_title:
            print('try infobox')
            info_eng = infobox_title[0].text_content().split('\n')
            if len(info_eng) == 1:  # a single line may be Chinese: try matching it against the lead paragraph
                eng = re.findall(info_eng[0].encode('utf-8') + '.*?[::]*([\w\d\s!\?\'",\.:;\&-]+)[);,\)]', first_p_plain)
                if eng and len(eng[0]) > 1:
                    print('infoxbox->eng match')
                    break
            elif info_eng:  # more than one line: take the last one as the original title
                print('infobox == eng')
                eng = [info_eng[-1]]
                break
            if info_eng:
                info_eng = info_eng[0]
            if jp_title:  # a span marked lang="ja" carries the Japanese original title
                eng = [jp_title[0].text_content()]
                break
            if len(info_eng) < 50 and info_eng != t and info_eng != p_title:
                print('infobox maybe')
                # raw_input(info_eng.decode('utf-8').encode('gbk'))
                maybe_eng = [info_eng]
                # break
        # last resort: match the lead paragraph without anchoring on the title
        eng = re.findall('.*?[::]*([\w\d\s!\?\'",\.:;\&-]+)[);,\)]', first_p_plain)
        if eng and len(eng[0]) > 1:
            break
        if maybe_eng:
            eng = maybe_eng
            break
        if not eng or len(eng[0]) <= 1:
            eng = ['']  # give up: no original title found
            break
        # print mw_content_plain.decode('utf-8').encode('gbk')
        # raise MWException("Original name not found for %s" % real_keywd)
    return eng[0], p_title.decode('utf-8')

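# Hypothetical standalone usage of get_orig_name (assumes a ./cache/ directory
# exists for the gzip page cache; the link and title below are placeholders):
#
#   orig, zh_title = get_orig_name(u'中文片名', 'http://zh.wikipedia.org/wiki/中文片名', disambig_keywd="电影")
#   # orig     -> original (English/Japanese) title found in the lead/infobox, or '' if none
#   # zh_title -> the page title with any trailing "(xxx)" qualifier stripped
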
def valid_link(link, link_type='wiki'):
    # reject namespaced pages (Wikipedia:, Category:, ...) and links outside /wiki/
    if re.findall(regex_speical, link) or not re.findall('/' + link_type + '/', link):
        return False
    else:
        return True

all_names = {}
name_record = []  # Chinese names already emitted, to avoid duplicate keys
sql_data = []     # "('chinese', 'original')" value tuples for the INSERT statement
sqlfile = r"z:/233.sql"
def make_list(idx_f, file_prefix='USTV_', disambig_keywd="电视剧", sel_node=u"//div[@id='mw-content-text']/*/li/a"):
    # idx_f: a locally saved Wikipedia list/category page; collects (Chinese, original) name pairs into sql_data
    page = lhtml.fromstring(open(idx_f).read().decode('utf-8'))
    err_names = []
    li_as = page.xpath(sel_node)
    sqlfile_f = open(sqlfile, 'a')  # opened but unused; the SQL file is written at module level below
    for li_a in li_as:
        href = li_a.values()[0]
        title = str(li_a.text_content())
        # quick guess from the list entry itself, e.g. "中文名 (Original Name, 2010)"
        fast_t = re.findall('.*?[::]*([\w\d\s!\?\'",\.:;\&-]+)(?:,[\s\d]+)[);,\)]', li_a.getparent().text_content())
        if valid_link(href):
            # all_names.append((href, title))
            try:
                o = get_orig_name(title, href, file_prefix=file_prefix, disambig_keywd=disambig_keywd)
            except MWException as ex:
                print('ERR:%s (%s)' % (title.decode('utf-8'), ex))
                err_names.append(title)
                # raw_input()
            else:
                if not o[0]:
                    err_names.append(title)
                else:
                    # if title not in all_names:
                    #     all_names[title] = []
                    # all_names[title] += [o[0], o[1]]
                    print(('%s => %s / %s' % (title, o[0], o[1])).encode('gbk', 'replace'))
                    orig = [o[0].strip()]
                    if fast_t:
                        orig.append(fast_t[0].strip())
                    orig = set(orig)
                    chn_name = set([o[1].strip(), title.strip().decode('utf-8')])
                    # pair every Chinese name with every original name, skipping duplicates
                    for _1 in chn_name:
                        for _2 in orig:
                            if _1 == _2:
                                continue
                            if _1 not in name_record:
                                sql_data.append("(\'" + _1.replace("'", "\\\'") + "\', \'" + _2.replace("'", "\\\'") + "\')")
                                name_record.append(_1)
                            else:
                                print('%s already has a val' % _1)
    # yield o[0], o[1]
    print('\n'.join(map(lambda x: x.decode('utf-8'), err_names)))

# collect names from the locally saved list/category pages and write them out as one INSERT statement
open(sqlfile, 'w').write('INSERT INTO `movie_sug` VALUES ')
make_list(r'./america_tv_list.htm')
make_list(r'./japan_mvi_list.htm', file_prefix="JPMVI_", disambig_keywd="电影")
make_list(r'./france_mvi_cat.htm', file_prefix="FRMVI_", disambig_keywd="电影", sel_node=u"//div[@id='mw-pages']/div[@class='mw-content-ltr']//li/a")
# print valid_link('/w/index.php?title=%E7%BC%BA%E5%B8%AD%E7%9A%84%E4%BA%BA&action=edit&redlink=1')
make_list(r'./america_mvi_list.htm', file_prefix="USMVI_", disambig_keywd="电影")
make_list(r'./india_mvi_cat.htm', file_prefix="INDMVI_", disambig_keywd="电影", sel_node=u"//div[@id='mw-pages']/div[@class='mw-content-ltr']//li/a")
make_list(r'./japan_mvi_cat.htm', file_prefix="JAPMVI_", disambig_keywd="电影", sel_node=u"//div[@id='mw-pages']/div[@class='mw-content-ltr']//li/a")
# print '\n'.join(all_names)
# print hrefs[0].text_content().encode('utf-8')
make_list(r'./eng_tv_cat.htm', file_prefix="ENTV_", disambig_keywd="电视剧", sel_node=u"//div[@id='mw-pages']/div[@class='mw-content-ltr']//li/a")
# make_list(r'z:/disney.htm')
open(sqlfile, 'a').write(','.join(sql_data) + ";")
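
# The output file (the path configured in `sqlfile` above) ends up holding a single
# statement of roughly this shape (values here are placeholders):
#   INSERT INTO `movie_sug` VALUES ('中文名', 'Original Name'),('另一部', 'Another Title'), ... ;
# It can then be loaded with any MySQL-style client, e.g. via input redirection:
#   mysql -u user -p dbname < 233.sql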