This gist provides, in a single file, the code underlying the blog post "Merging datasets with LingPy and the CLDF curation framework".
Created
December 11, 2018 12:22
-
-
Save LinguList/a217baf24023358a3e541040017bb503 to your computer and use it in GitHub Desktop.
Merging datasets with LingPy and the CLDF curation framework
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Merge the wordlists shipped with two lexibank datasets (chaconarawakan and
# chaconbaniwa) into a single LingPy wordlist and write the result to
# 'chacon-arawakan-baniwa.tsv'. Output rows are restricted to the concepts of
# the second wordlist whose Concepticon ID occurs in the Swadesh-1955-100
# concept list.
from lingpy import Wordlist, ipa2tokens
from lexibank_chaconarawakan import Dataset as ds1
from lexibank_chaconbaniwa import Dataset as ds2
from pyconcepticon.api import Concepticon

# Load the raw wordlist file of each dataset.
wl1 = Wordlist(ds1().raw.joinpath("arawakan_swadesh_100_edictor.tsv").as_posix())
wl2 = Wordlist(ds2().raw.joinpath("Bruzzi_Granadillo.txt").as_posix())

# Concepticon IDs of the Swadesh-1955-100 list; kept as a set because it is
# probed once per row of wl2 below.
swad = {
    c.concepticon_id
    for c in Concepticon().conceptlists['Swadesh-1955-100'].concepts.values()
}

# Concept labels (as used in wl2) whose Concepticon ID is in the Swadesh list.
concepts = {
    wl2[idx, 'concept'] for idx in wl2 if wl2[idx, 'concepticon_id'] in swad
}

# Build the merged table in LingPy's dict format: key 0 holds the header,
# every following integer key holds one row. Two columns are appended:
#   old_idx - "<dataset>-<original row id>"
#   cogx    - "<dataset>-<original cogid>", so cognate labels coming from
#             different datasets can never coincide.
D = {0: wl1.columns + ['old_idx', 'cogx']}
nidx = 1
for prefix, wordlist in (('chaconarawakan', wl1), ('chaconbaniwa', wl2)):
    for idx in wordlist:
        # Rows of both wordlists are read through wl1's columns so that they
        # share the single header defined above.
        D[nidx] = [wordlist[idx, h] for h in wl1.columns]
        D[nidx] += ['{0}-{1}'.format(prefix, idx)]
        D[nidx] += ['{0}-{1}'.format(prefix, wordlist[idx, 'cogid'])]
        nidx += 1

wl = Wordlist(D)

# Derive 'cogid' from the dataset-prefixed labels in 'cogx', replacing the
# 'cogid' values copied over from the source wordlists.
wl.renumber('cogx', 'cogid', override=True)

# Fill in missing segmentations by tokenizing the IPA string.
for idx, ipa, segments in wl.iter_rows('ipa', 'segments'):
    if not segments:
        wl[idx, 'segments'] = ipa2tokens(ipa)

# Write every column except 'value_in_source'; the row condition is handed to
# LingPy as a string expression built from the selected concept set.
wl.output(
    'tsv',
    filename='chacon-arawakan-baniwa.tsv',
    subset=True,
    cols=[c for c in wl.columns if c != 'value_in_source'],
    rows=dict(concept='in ' + str(concepts)),
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
lingpy >= 2.6.4
-e git+https://github.com/lexibank/chaconarawakan.git@v1.0.1#egg=lexibank_chaconarawakan
-e git+https://github.com/lexibank/chaconbaniwa.git@v1.0.0#egg=lexibank_chaconbaniwa
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment