Skip to content

Instantly share code, notes, and snippets.

@LinguList
Created December 11, 2018 12:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save LinguList/a217baf24023358a3e541040017bb503 to your computer and use it in GitHub Desktop.
Save LinguList/a217baf24023358a3e541040017bb503 to your computer and use it in GitHub Desktop.
Merging datasets with LingPy and the CLDF curation framework
from lingpy import *
from lexibank_chaconarawakan import Dataset as ds1
from lexibank_chaconbaniwa import Dataset as ds2
from pyconcepticon.api import Concepticon
# Merge the two Chacon wordlists (Arawakan and Baniwa) into a single LingPy
# wordlist and export the rows of the second dataset's Swadesh-100 concepts.

# Load each wordlist from the raw directory of its lexibank dataset.
wl1 = Wordlist(ds1().raw.joinpath("arawakan_swadesh_100_edictor.tsv").as_posix())
wl2 = Wordlist(ds2().raw.joinpath("Bruzzi_Granadillo.txt").as_posix())

# Concepticon IDs of the Swadesh-1955-100 concept list. Stored as a set because
# it is only used for membership tests, once per row of wl2.
swad = {
    c.concepticon_id
    for c in Concepticon().conceptlists['Swadesh-1955-100'].concepts.values()
}

# Concept labels of wl2 whose Concepticon ID belongs to the Swadesh list; these
# select the rows that are written out at the end.
concepts = {
    wl2[idx, 'concept'] for idx in wl2 if wl2[idx, 'concepticon_id'] in swad
}

# Build the merged table: row 0 is the header (the columns of wl1 plus two
# bookkeeping columns), the following rows are numbered consecutively.
D = {0: wl1.columns + ['old_idx', 'cogx']}
nidx = 1
for prefix, source in (('chaconarawakan-', wl1), ('chaconbaniwa-', wl2)):
    for idx in source:
        # Both datasets are read through the column names of wl1 so that all
        # merged rows share one layout.
        D[nidx] = [source[idx, h] for h in wl1.columns]
        # 'old_idx' and 'cogx' carry the dataset prefix, which keeps row ids
        # and cognate ids of the two sources from colliding.
        D[nidx] += [prefix + str(idx), prefix + str(source[idx, 'cogid'])]
        nidx += 1

wl = Wordlist(D)
# Derive the 'cogid' column from the prefixed 'cogx' labels, replacing the
# 'cogid' values copied from the sources.
wl.renumber('cogx', 'cogid', override=True)

# Fill in missing segmentations by tokenizing the IPA transcription.
for idx, ipa, segments in wl.iter_rows('ipa', 'segments'):
    if not segments:
        wl[idx, 'segments'] = ipa2tokens(ipa)

# Write all columns except 'value_in_source', restricted to the selected
# concepts. The row condition is passed to LingPy as a string expression.
# NOTE(review): LingPy may append the format extension to `filename` itself;
# confirm whether the file ends up as '...tsv.tsv'.
wl.output(
    'tsv',
    filename='chacon-arawakan-baniwa.tsv',
    subset=True,
    cols=[c for c in wl.columns if c not in ['value_in_source']],
    rows=dict(concept='in ' + str(concepts)),
)
lingpy >= 2.6.4
-e git+https://github.com/lexibank/chaconarawakan.git@v1.0.1#egg=lexibank_chaconarawakan
-e git+https://github.com/lexibank/chaconbaniwa.git@v1.0.0#egg=lexibank_chaconbaniwa
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment