Skip to content

Instantly share code, notes, and snippets.

@erickguan
Last active January 24, 2018 11:16
Show Gist options
  • Save erickguan/7995fac077e48600debfac92a293c345 to your computer and use it in GitHub Desktop.
Save erickguan/7995fac077e48600debfac92a293c345 to your computer and use it in GitHub Desktop.
Import parts of CBDB into Wikidata.
#%%
from datetime import date
from sqlalchemy import create_engine
import pywikibot
from pprint import pprint
from operator import is_not
from functools import partial
from concurrent.futures import ThreadPoolExecutor
from SPARQLWrapper import SPARQLWrapper, JSON
import sys
# Ask the Wikidata query service for every item that already has a P497
# statement, so create_item() can update those items instead of creating
# new ones.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery('''PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX bd: <http://www.bigdata.com/rdf#>
SELECT ?item ?value WHERE {
?item wdt:P497 ?value
}''')
sparql.setReturnFormat(JSON)
json = sparql.query().convert()

# Maps P497 value -> entity URI of the item carrying it.
cdbd_ids = {
    binding['value']['value']: binding['item']['value']
    for binding in json['results']['bindings']
}
pprint(len(cdbd_ids))
# Maps a CBDB status code (as a decimal string) to the Wikidata item id that
# create_item() uses as the target of a P106 claim. Status codes that are not
# listed here are skipped.
STATUS_CODE_TO_PROPERTY = {
    '2': 'Q11545923',
    '3': 'Q1294787',
    '4': 'Q11063',
    '7': 'Q854979',
    '8': 'Q854997',
    '9': 'Q3303330',
    '26': 'Q39018',
    '27': 'Q7723211',
    '29': 'Q179294',
    '31': 'Q45352519',
    '33': 'Q131512',
    '35': 'Q220098',
    '36': 'Q844586',
    '37': 'Q15954519',
    '43': 'Q903422',
    '48': 'Q1097498',
    '56': 'Q1062083',
    '57': 'Q170790',
    '59': 'Q215536',
    '60': 'Q1688932',
    '65': 'Q16744001',
    '71': 'Q1028181',
    '72': 'Q39631',
    '74': 'Q13219330',
    '75': 'Q1125062',
    '76': 'Q2303143',
    '86': 'Q12773225',
    '89': 'Q48282',
    '94': 'Q37226',
    '114': 'Q49757',
    '142': 'Q201788',
    '143': 'Q1126160',
    '165': 'Q36180',
    '166': 'Q1437754',
    '180': 'Q20826540',
    '182': 'Q14467526',
    '184': 'Q3243461',
    '235': 'Q45353837',
    '236': 'Q45353897',
}
# Half-open c_personid range [LIMIT[0], LIMIT[1]) to import, taken from the
# first two command-line arguments.
# Example of a previously hard-coded range: LIMIT = (33236, 366589)
LIMIT = (int(sys.argv[1]), int(sys.argv[2]))

# Local CBDB SQLite dump; `conn` is only used for the BIOG_MAIN scan below,
# create_item() opens its own connection from `eng`.
eng = create_engine('sqlite:///20170424CBDBauUserSqlite.db')
conn = eng.connect()
biog = conn.execute(
    f'''SELECT * FROM BIOG_MAIN WHERE c_personid >= {LIMIT[0]} AND c_personid < {LIMIT[1]} ORDER BY c_personid ASC;'''
)

# Shared pywikibot site handle and worker pool used by the import loop.
s = pywikibot.Site('wikidata', 'wikidata')
executor = ThreadPoolExecutor(max_workers=2)
def _merge_unique(existing, extra):
    """Return the entries of *existing* followed by *extra*, without repeats.

    Order is preserved (first occurrence wins) so the resulting alias list is
    deterministic between runs. *existing* may be ``None``.
    """
    return list(dict.fromkeys([*(existing or []), *extra]))


def create_item(row, eng):
    """Create or update the Wikidata item for one ``BIOG_MAIN`` row.

    The function reads the module-level globals ``s`` (pywikibot site),
    ``cdbd_ids`` (P497 value -> entity URI) and ``STATUS_CODE_TO_PROPERTY``.

    Args:
        row: Result row supporting ``row['column']`` access with at least the
            columns ``c_personid``, ``c_name``, ``c_name_chn``,
            ``c_mingzi_rm`` and ``c_name_rm``.
        eng: Engine-like object whose ``connect()`` returns a connection that
            can ``execute()`` raw SQL (the SQLAlchemy engine created at
            module level).

    Returns:
        None. The result is the edit performed through pywikibot.
    """
    c_personid = row['c_personid']
    print(c_personid)
    eng_name = row['c_name']
    cht_name = row['c_name_chn']
    aliases_eng = [row['c_mingzi_rm'], row['c_name_rm']]
    aliases_cht = []

    # Read everything needed from SQLite up front and release the connection
    # before the slow network edits; previously the connection opened for
    # each person was never closed.
    with eng.connect() as conn:
        for alt in conn.execute(f'''SELECT * FROM ALTNAME_DATA WHERE c_personid={c_personid};'''):
            aliases_eng.append(alt['c_alt_name'])
            aliases_cht.append(alt['c_alt_name_chn'])
        # Select only d.c_status_code: both joined tables have a column of
        # that name, and the join condition makes the two values equal.
        status_keys = [
            str(int(status['c_status_code']))
            for status in conn.execute(f'''SELECT d.c_status_code AS c_status_code FROM STATUS_CODES c, STATUS_DATA d WHERE c.c_status_code = d.c_status_code AND d.c_status_code>0 AND d.c_personid={c_personid};''')
        ]

    aliases_eng = [name for name in aliases_eng if name is not None]
    aliases_cht = [name for name in aliases_cht if name is not None]

    # Without a primary name, promote the last alternative name to that role.
    if eng_name is None and aliases_eng:
        eng_name = aliases_eng.pop()
    if cht_name is None and aliases_cht:
        cht_name = aliases_cht.pop()

    # P497 values are looked up as the person id left-padded to 7 digits.
    rjust_personid = str(c_personid).rjust(7, '0')
    entity_uri = cdbd_ids.get(rjust_personid)
    if entity_uri is None:
        # Unknown person: start from a blank item.
        item = pywikibot.ItemPage(s)
        data = {'aliases': {}, 'labels': {}, 'claims': {}}
        label_eng = None
        label_cht = None
    else:
        item = pywikibot.ItemPage.from_entity_uri(s, entity_uri)
        fetched = item.get()
        data = {
            'aliases': fetched['aliases'],
            'labels': fetched['labels'],
            'claims': fetched['claims'],
        }
        label_eng = item.labels.get('en')
        label_cht = (
            item.labels.get('zh-hant')
            or item.labels.get('zh')
            or item.labels.get('zh-hans')
        )

    # Never overwrite an existing label: the CBDB name becomes an alias
    # instead, unless it is identical to the label already present.
    if eng_name is not None:
        if label_eng is None:
            data['labels']['en'] = eng_name
        elif eng_name != label_eng:
            aliases_eng.insert(0, eng_name)
    if cht_name is not None:
        if label_cht is None:
            data['labels']['zh-hant'] = cht_name
        elif cht_name != label_cht:
            aliases_cht.insert(0, cht_name)

    # Merge new aliases into the existing ones; languages that end up with
    # no aliases are left out of the payload.
    for lang, extra in (('en', aliases_eng), ('zh-hant', aliases_cht)):
        merged = _merge_unique(data['aliases'].get(lang), extra)
        if merged:
            data['aliases'][lang] = merged
        elif lang in data['aliases']:
            del data['aliases'][lang]
    pprint(data)

    summary = "Imported from CBDB [[Wikidata:Data_Import_Hub#CBDB]]"
    if data['claims'].get('P497') is not None:
        # The item already carries a P497 claim: send labels/aliases only and
        # leave its claims untouched.
        del data['claims']
        item.editEntity(data, summary=summary)
        return

    # NOTE(review): the claims below are added only when the item has no
    # P497 claim yet -- confirm this matches the intended behaviour for
    # items that already exist.
    item.editEntity(data, summary=summary)

    claim = pywikibot.Claim(s, 'P497')
    claim.setTarget(rjust_personid)
    item.addClaim(claim)

    human_claim = pywikibot.Claim(s, 'P31')
    human_claim.setTarget(pywikibot.ItemPage(s, 'Q5'))
    item.addClaim(human_claim)

    # One P106 claim per distinct mapped status; repeated status rows for the
    # same person would otherwise add the same claim several times.
    occupation_ids = dict.fromkeys(
        STATUS_CODE_TO_PROPERTY[key]
        for key in status_keys
        if key in STATUS_CODE_TO_PROPERTY
    )
    for occupation_id in occupation_ids:
        occu_claim = pywikibot.Claim(s, 'P106')
        occu_claim.setTarget(pywikibot.ItemPage(s, occupation_id))
        item.addClaim(occu_claim)
# Hand each BIOG_MAIN row to the pool and wait for it before submitting the
# next one; .result() also re-raises any exception from create_item().
for person_row in biog:
    pending = executor.submit(create_item, person_row, eng)
    outcome = pending.result()
    print(outcome)
#%%
import pywikibot
from pprint import pprint
# Scratch cell: print the first P497 claim of Q720, then build a P497 claim
# by hand to compare the two representations.
s = pywikibot.Site('wikidata', 'wikidata')
q720_claims = pywikibot.ItemPage(s, 'Q720').get()['claims']
pprint(q720_claims['P497'][0])
# Claim.fromJSON(DataSite("wikidata", "wikidata"), {'mainsnak': {'snaktype': 'value', 'property': 'P497', 'datatype': 'external-id', 'datavalue': {'value': '0029239', 'type': 'string'}}, 'type': 'statement', 'id': 'Q720$4C93CED5-F403-45DD-B270-AA9D6AD76AB7', 'rank': 'normal'})
claim_p = pywikibot.Claim(s, 'P497')
claim_p.setTarget('1111111')
pprint(claim_p)
#%%
import pywikibot
from pprint import pprint
# Scratch cell: dump item Q110490 from the 'test' site of the wikidata family.
s = pywikibot.Site('test', 'wikidata')
test_item = pywikibot.ItemPage(s, 'Q110490')
pprint(test_item.get())
#%%
import pywikibot
from pprint import pprint
# Scratch cell: resolve an item from its entity URI and dump its content.
s = pywikibot.Site('wikidata', 'wikidata')
uri_item = pywikibot.ItemPage.from_entity_uri(
    s, 'http://www.wikidata.org/entity/Q11109043'
)
pprint(uri_item.get())
#%%
import pywikibot
from pprint import pprint
# Scratch cell: create a new item on the 'test' site with a single English
# label and print what comes back.
s = pywikibot.Site('test', 'wikidata')
p = pywikibot.ItemPage(s)
new_item_data = {'labels': {'en': 'test2'}}
p.editEntity(new_item_data)
pprint(p.get())
#%%
import pywikibot
from pprint import pprint
# Scratch cell: dump property P497 itself, then print a hand-built P497 claim.
s = pywikibot.Site('wikidata', 'wikidata')
p497_page = pywikibot.PropertyPage(s, 'P497')
pprint(p497_page.get())
claim_p = pywikibot.Claim(s, 'P497')
claim_p.setTarget('1111111')
pprint(claim_p)
#%%
# Scratch cell: construct an API request object with no arguments and print it.
# NOTE(review): this cell has no import of its own, so it relies on
# `pywikibot` having been imported by an earlier cell.
a = pywikibot.data.api.Request()
print(a)
#%%
from SPARQLWrapper import SPARQLWrapper, JSON
import pickle
# Fetch the P497 value -> entity URI mapping from the Wikidata query service
# and cache it on disk as a pickle named 'dump_cdbd_mapping'.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery('''PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX bd: <http://www.bigdata.com/rdf#>
SELECT ?item ?value WHERE {
?item wdt:P497 ?value
}''')
sparql.setReturnFormat(JSON)
json = sparql.query().convert()
cdbd_ids = {
    binding['value']['value']: binding['item']['value']
    for binding in json['results']['bindings']
}
with open('dump_cdbd_mapping', 'wb') as f:
    pickle.dump(cdbd_ids, f)
#%%
import pickle
# Reload the cached mapping written by the dump cell.
# NOTE: pickle can execute arbitrary code on load; only read a
# 'dump_cdbd_mapping' file that this script produced itself.
with open('dump_cdbd_mapping', 'rb') as f:
    cdbd_ids = pickle.load(f)

print(cdbd_ids)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment