Skip to content

Instantly share code, notes, and snippets.

@LinguList
Created November 6, 2018 10:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save LinguList/1056960125ca79428b420257fa4b02eb to your computer and use it in GitHub Desktop.
Save LinguList/1056960125ca79428b420257fa4b02eb to your computer and use it in GitHub Desktop.
Inferring consonant clusters from CLICS data with LingPy: Data and Code

Inferring consonant clusters from CLICS data with LingPy: Data and Code

This Gist accompanies the blog post explaining the code, which you can find here.

To install and run the code, run the following in your terminal:

$ pip install -r pip-requirements.txt
$ git clone https://github.com/clld/concepticon-data.git
$ cd concepticon-data
$ python setup.py develop
$ cd ..
$ git clone https://github.com/clld/glottolog.git
$ cd glottolog
$ python setup.py develop
$ cd ..
$ clics load concepticon-data/ glottolog/
$ python code.py
from lingpy import *
from pyclics.api import Clics
from pyclics.models import Form
from tqdm import tqdm
from collections import defaultdict
def iter_wordlists(db, varieties):
    """Yield ``(variety, forms)`` pairs, one per distinct variety.

    Varieties are de-duplicated on the pair ``(v.source, v.id)`` (a later
    variety with the same pair replaces an earlier one) and visited in the
    sorted order of that pair.

    :param db: object exposing ``fetchall(sql, params=...)``; each returned
        row is unpacked positionally into ``Form``.
    :param varieties: iterable of objects with ``source`` and ``id``
        attributes.
    :returns: generator of ``(variety, list_of_Form)`` tuples.
    :raises AssertionError: if the query returns no rows for a variety.
    """
    # The SQL text is kept verbatim; only forms whose parameter is linked to
    # a Concepticon concept set are selected.
    query = """
select
f.id, f.dataset_id, f.form, f.segments,
p.name, p.concepticon_id, p.concepticon_gloss, p.ontological_category,
p.semantic_field
from
formtable as f, parametertable as p
where
f.parameter_id = p.id
and f.dataset_id = p.dataset_id
and p.concepticon_id is not null
and f.language_id = ?
and f.dataset_id = ?
order by
f.dataset_id, f.language_id, p.concepticon_id
"""
    languages = {(v.source, v.id): v for v in varieties}
    for (dsid, vid), v in sorted(languages.items()):
        # Placeholder order in the query is language_id first, dataset_id
        # second, hence (vid, dsid).
        forms = [Form(*row) for row in db.fetchall(query, params=(vid, dsid))]
        assert forms, (
            'no Concepticon-linked forms for variety {0!r} '
            'in dataset {1!r}'.format(vid, dsid))
        yield v, forms
def get_clusters(tokens, prostring):
    """Collect runs of consonants from a segmented form.

    ``tokens`` and ``prostring`` are walked in parallel (stopping at the
    shorter of the two). Consecutive tokens whose prosodic symbol is ``'C'``
    are joined into one cluster prefixed with ``'</'``; consecutive tokens
    whose symbol is ``'c'`` are joined into one cluster prefixed with
    ``'>/'``. Any other symbol ends the current run. A switch from ``'C'``
    to ``'c'`` (or back) also starts a new cluster.

    NOTE(review): the symbols presumably come from LingPy's
    ``prosodic_string(..., _output='CcV')`` -- confirm their exact meaning
    against the LingPy documentation.

    :param tokens: sequence of segment strings.
    :param prostring: sequence (typically a string) of one prosodic symbol
        per token.
    :returns: list of space-separated cluster strings such as ``'</ s t'``,
        in order of occurrence; single consonants are included as clusters
        of one segment.
    """
    markers = {'C': '<', 'c': '>'}
    # The leading empty string guarantees that clusters[-1] always exists.
    clusters = ['']
    for token, symbol in zip(tokens, prostring):
        marker = markers.get(symbol)
        if marker is None:
            # Non-consonant: push a separator so the next consonant starts
            # a fresh cluster.
            clusters.append('')
        elif clusters[-1].startswith(marker):
            clusters[-1] += ' ' + token
        else:
            clusters.append(marker + '/ ' + token)
    # Drop the empty separators.
    return [cluster for cluster in clusters if cluster]
# Script body: walk all CLICS varieties, compute a prosodic string for every
# segmented form, and write (1) a wordlist with the CV patterns and (2) a
# table of consonant clusters with the number of distinct keys they occur in.
clics = Clics('.')

# Data handed to Wordlist below: key 0 holds the column names, every other
# key one row. `idx` is incremented for every form, including skipped ones,
# so the row keys may have gaps.
D, idx = {0: ['doculect', 'concept', 'segments', 'cv']}, 0

# (cluster, length) -> {last '-'-separated part of v.gid -> occurrence count}
clusters = defaultdict(lambda: defaultdict(int))

varieties = clics.db.varieties
print('[i] loaded clics varieties')

for v, forms in tqdm(
        iter_wordlists(clics.db, varieties), total=len(varieties)):
    for form in forms:
        idx += 1
        clics_form = form.clics_form.strip()
        if not clics_form:
            continue
        tokens = clics_form.split()
        try:
            prostring = prosodic_string(tokens, _output='CcV')
        except ValueError:
            # No prosodic string could be computed for this form. Skip it
            # entirely: falling through would pair these tokens with the
            # prosodic string of the previously processed form (or hit an
            # undefined name on the very first form).
            continue
        D[idx] = [v.gid, form.concepticon_id, clics_form, prostring]
        for clr in get_clusters(tokens, prostring):
            # The length is the number of whitespace-separated parts of the
            # cluster string, which includes its leading '</' or '>/' marker.
            clusters[clr, len(clr.split())][v.gid.split('-')[-1]] += 1

wl = Wordlist(D)
wl.output('tsv', filename='cv-patterns', ignore='all', prettify=False)

# Segments are not restricted to ASCII, so the encoding is set explicitly
# instead of relying on the platform default.
with open('cv-clusters.tsv', 'w', encoding='utf-8') as f:
    f.write('Cluster\tLength\tFrequency\n')
    # Most widely attested clusters first; "Frequency" is the number of
    # distinct keys the cluster was seen under, not the raw token count.
    for (cluster, length), rest in sorted(
            clusters.items(), key=lambda x: len(x[1]), reverse=True):
        f.write('{0}\t{1}\t{2}\n'.format(cluster, length, len(rest)))
-e git+https://github.com/lingpy/lingpy.git@v2.6.4.alpha#egg=lingpy
-e git+https://github.com/clics/clics2.git@v1.1.1#egg=pyclics
-e git+https://github.com/lexibank/allenbai.git@v1.0#egg=lexibank_allenbai
-e git+https://github.com/lexibank/bantubvd.git@v1.0#egg=lexibank_bantubvd
-e git+https://github.com/lexibank/beidasinitic.git@v2.0#egg=lexibank_beidasinitic
-e git+https://github.com/lexibank/bowernpny.git@v1.1.1#egg=lexibank_bowernpny
-e git+https://github.com/lexibank/hubercolumbian.git@v1.0#egg=lexibank_hubercolumbian
-e git+https://github.com/lexibank/ids.git@v1.2#egg=lexibank_ids
-e git+https://github.com/lexibank/kraftchadic.git@v1.0#egg=lexibank_kraftchadic
-e git+https://github.com/lexibank/northeuralex.git@v1.0#egg=lexibank_northeuralex
-e git+https://github.com/lexibank/robinsonap.git@v1.1#egg=lexibank_robinsonap
-e git+https://github.com/lexibank/satterthwaitetb.git@v1.0#egg=lexibank_satterthwaitetb
-e git+https://github.com/lexibank/suntb.git@v1.1#egg=lexibank_suntb
-e git+https://github.com/lexibank/tls.git@v1.1#egg=lexibank_tls
-e git+https://github.com/lexibank/tryonsolomon.git@v1.0.1#egg=lexibank_tryonsolomon
-e git+https://github.com/lexibank/wold.git@v1.1#egg=lexibank_wold
-e git+https://github.com/lexibank/zgraggenmadang.git@v1.1#egg=lexibank_zgraggenmadang
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment