Skip to content

Instantly share code, notes, and snippets.



Created Dec 11, 2018
What would you like to do?
Merging datasets with LingPy and the CLDF curation framework
from lingpy import *
from lexibank_chaconarawakan import Dataset as ds1
from lexibank_chaconbaniwa import Dataset as ds2
from pyconcepticon.api import Concepticon
# Load the two source wordlists from files in each dataset's `raw` directory.
wl1 = Wordlist(ds1().raw.joinpath("arawakan_swadesh_100_edictor.tsv").as_posix())
wl2 = Wordlist(ds2().raw.joinpath("Bruzzi_Granadillo.txt").as_posix())
# NOTE(review): the two statements below are truncated in this copy -- each
# stops after its final `in`, so the iterable and the closing bracket are
# missing. Restore them from the original script before running.
# Collects the `concepticon_id` of every item of the (missing) iterable.
swad = [c.concepticon_id for c in
# Set of `concept` values of the wl2 rows whose `concepticon_id` is contained
# in the (missing) collection -- presumably `swad`; confirm against the original.
# `concepts` is later used to restrict the rows written to the output file.
concepts = {wl2[idx, 'concept'] for idx in wl2 if wl2[idx, 'concepticon_id'] in
# Merge both wordlists into one dictionary in LingPy's wordlist layout:
# key 0 holds the header, every other integer key holds one row.
# Two columns are appended to wl1's columns: `old_idx` (dataset-prefixed
# original row id) and `cogx` (dataset-prefixed original cognate id).
D = {0: wl1.columns+['old_idx', 'cogx']}
nidx = 1
for idx in wl1:
    D[nidx] = [wl1[idx, h] for h in wl1.columns] + ['chaconarawakan-'+str(idx)]
    D[nidx] += ['chaconarawakan-'+str(wl1[idx, 'cogid'])]
    nidx += 1
for idx in wl2:
    # Read wl2 through wl1's column names so the values line up with the
    # shared header stored in D[0].
    D[nidx] = [wl2[idx, h] for h in wl1.columns] + ['chaconbaniwa-'+str(idx)]
    D[nidx] += ['chaconbaniwa-'+str(wl2[idx, 'cogid'])]
    nidx += 1
wl = Wordlist(D)
# The dataset prefix keeps cognate ids from the two sources distinct;
# renumber() then replaces the `cogid` column with ids derived from `cogx`.
wl.renumber('cogx', 'cogid', override=True)
# Fill in missing segmentations by tokenizing the IPA transcription.
for idx, ipa, segments in wl.iter_rows('ipa', 'segments'):
    if not segments:
        wl[idx, 'segments'] = ipa2tokens(ipa)
# Write every column except `value_in_source`, restricted to the rows whose
# concept is in `concepts` (the `rows` value is a condition string built from
# the set's repr).
# NOTE(review): LingPy may append the format extension to `filename` itself,
# which would produce "chacon-arawakan-baniwa.tsv.tsv" -- confirm.
wl.output('tsv', filename='chacon-arawakan-baniwa.tsv', subset=True, cols=[
    c for c in wl.columns if c not in ['value_in_source']],
    rows=dict(concept = 'in '+str(concepts)))
lingpy >= 2.6.4
-e git+
-e git+
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment