Merging datasets with LingPy and the CLDF curation framework
This gist provides, in a single file, the code underlying the blog post "Merging datasets with LingPy and the CLDF curation framework".
"""Merge the chaconarawakan and chaconbaniwa wordlists with LingPy.

The script reads one raw wordlist file from each of the two lexibank
datasets, copies all rows into a single wordlist that shares the columns
of the first one, renumbers the cognate sets, fills in missing
segmentations and writes a TSV restricted to the concepts of the second
wordlist that are linked to the Swadesh-1955-100 concept list.
"""
from lingpy import *
from lexibank_chaconarawakan import Dataset as ds1
from lexibank_chaconbaniwa import Dataset as ds2
from pyconcepticon.api import Concepticon

# Load the raw wordlist file shipped in each dataset's `raw` directory.
wl1 = Wordlist(ds1().raw.joinpath("arawakan_swadesh_100_edictor.tsv").as_posix())
wl2 = Wordlist(ds2().raw.joinpath("Bruzzi_Granadillo.txt").as_posix())

# Concepticon IDs of the Swadesh-1955-100 list. A set is used because the
# collection is only ever queried for membership.
swad = {
    c.concepticon_id
    for c in Concepticon().conceptlists['Swadesh-1955-100'].concepts.values()
}

# Concept labels of the second wordlist whose Concepticon ID is in that list.
concepts = {
    wl2[idx, 'concept'] for idx in wl2 if wl2[idx, 'concepticon_id'] in swad
}

# Build the merged data as a dictionary: key 0 carries the column names,
# every following integer key carries one row. Both sources are read with
# the columns of `wl1`, so the rows line up under a single header.
D = {0: wl1.columns + ['old_idx', 'cogx']}
nidx = 1
for prefix, source in (('chaconarawakan-', wl1), ('chaconbaniwa-', wl2)):
    for idx in source:
        D[nidx] = [source[idx, h] for h in wl1.columns]
        # Prefixing the original row ID and cognate ID with the dataset name
        # keeps identifiers from the two sources distinct after merging.
        D[nidx] += [prefix + str(idx), prefix + str(source[idx, 'cogid'])]
        nidx += 1

wl = Wordlist(D)

# Derive the `cogid` column from the prefixed `cogx` labels, replacing the
# values copied over from the source wordlists.
wl.renumber('cogx', 'cogid', override=True)

# Segment the IPA form for every row that has no segmentation yet.
for idx, ipa, segments in wl.iter_rows('ipa', 'segments'):
    if not segments:
        wl[idx, 'segments'] = ipa2tokens(ipa)

# Write all columns except `value_in_source`, keeping only rows whose concept
# is among the selected ones.
# NOTE(review): LingPy may append the format extension to `filename`; confirm
# the name of the file that is actually written.
wl.output(
    'tsv',
    filename='chacon-arawakan-baniwa.tsv',
    subset=True,
    cols=[c for c in wl.columns if c not in ['value_in_source']],
    rows=dict(concept='in ' + str(concepts)),
)
lingpy >= 2.6.4
-e git+https://github.com/lexibank/chaconarawakan.git@v1.0.1#egg=lexibank_chaconarawakan
-e git+https://github.com/lexibank/chaconbaniwa.git@v1.0.0#egg=lexibank_chaconbaniwa