@LinguList
Created October 27, 2022 16:44
Querying Datasets with Cognates in the Lexibank Repository

In order to run the code below (code.py), you need to clone the lexibank-analysed repository (https://github.com/lexibank/lexibank-analysed) and install it with pip. Then download the script and place it in the lexibank-analysed folder. Before running it, execute `cldfbench download cldfbench_lexibank_analysed.py` to download all individual datasets. After that, you can run the script by typing `python code.py`. In the resulting table, "Cognates" counts cognate sets that link at least two words, while "Singletons" counts cognate sets attested in one word only. The output looks as follows:
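Summarized as shell commands (a sketch assuming a standard Python environment; the editable-install flag is my assumption, any pip installation of the package should work):

```bash
git clone https://github.com/lexibank/lexibank-analysed
cd lexibank-analysed
pip install -e .
cldfbench download cldfbench_lexibank_analysed.py
# place code.py in this folder, then:
python code.py
```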

| Dataset               | Concepts | Languages | Words | Cognates | Singletons |
|:----------------------|---------:|----------:|------:|---------:|-----------:|
| bdpa                  |      519 |       538 | 50095 |      750 |          0 |
| blustaustronesian     |      210 |        20 |  4358 |      321 |       2409 |
| bowernpny             |      344 |       190 | 44876 |     4494 |      25054 |
| cals                  |      184 |        88 | 15826 |      718 |        181 |
| carvalhopurus         |      205 |         4 |   731 |      206 |          0 |
| chaconarawakan        |      102 |         8 |   711 |      135 |        141 |
| chaconbaniwa          |      243 |        14 |  2354 |      298 |        312 |
| chaconcolumbian       |      128 |        69 |  9030 |     1192 |       1316 |
| chacontukanoan        |      142 |        16 |  1542 |      141 |          8 |
| constenlachibchan     |      110 |        25 |  2631 |      429 |        976 |
| davletshinaztecan     |      100 |         9 |   854 |      118 |         83 |
| deepadungpalaung      |      100 |        16 |  1584 |      184 |         35 |
| dravlex               |      100 |        20 |  2127 |      280 |        511 |
| dunnaslian            |      146 |        32 |  4560 |      589 |       1024 |
| dunnielex             |      207 |        20 |  4393 |      734 |       1043 |
| felekesemitic         |      150 |        21 |  3120 |      356 |        322 |
| galuciotupi           |      100 |        23 |  2258 |      303 |        409 |
| gaotb                 |      100 |        51 |  5085 |      480 |        665 |
| gerarditupi           |      244 |        38 |  7621 |      557 |        525 |
| hattorijaponic        |      200 |        10 |  1986 |      278 |        182 |
| kesslersignificance   |      200 |         8 |  1600 |      214 |       1031 |
| leekoreanic           |      246 |        15 |  2365 |      300 |         93 |
| lieberherrkhobwa      |      100 |        22 |  2144 |      243 |         67 |
| listsamplesize        |      550 |         4 |  2429 |      512 |       1086 |
| lundgrenomagoa        |     1807 |         3 |  3289 |      398 |       2138 |
| mcelhanonhuon         |      140 |        14 |  1960 |      387 |        562 |
| nagarajakhasian       |      200 |         6 |  1214 |      217 |        517 |
| peirosaustroasiatic   |      100 |       109 | 10706 |     1117 |        819 |
| pharaocoracholaztecan |      100 |         9 |   800 |      150 |        152 |
| ratcliffearabic       |      100 |        14 |  1340 |       98 |        358 |
| robinsonap            |      398 |        13 |  4841 |      521 |       2873 |
| saenkoromance         |      110 |        43 |  4853 |      224 |        241 |
| sagartst              |      250 |        50 | 12179 |     1604 |       3516 |
| savelyevturkic        |      254 |        32 |  8360 |      660 |        245 |
| sidwellbahnaric       |      200 |        24 |  4546 |      530 |        525 |
| simsrma               |      233 |        11 |  1055 |      170 |         15 |
| starostinpie          |      110 |        19 |  2172 |      306 |        328 |
| syrjaenenuralic       |      173 |         7 |  1401 |      252 |        618 |
| walworthpolynesian    |      210 |        31 |  7518 |      664 |        792 |
| wichmannmixezoquean   |      110 |        10 |  1106 |      199 |        140 |
| yanglalo              |      994 |         8 |  8505 |     1212 |         10 |
| zhivlovobugrian       |      110 |        21 |  2055 |      203 |         39 |
| utoaztecan            |      121 |        46 |  5813 |      680 |       1362 |
| abvdoceanic           |      210 |       417 | 85560 |     7248 |      21076 |

To cite this study, please refer to the original blog post in which it was presented (https://calc.hypotheses.org/4872).

"""
Compute coverage for datasets with cognates.
"""
from cldfbench_lexibank_analysed import Dataset as Lexibank
from lingpy import *
from tabulate import tabulate
partial = [
"mannburmish",
"bodtkhobwa",
"houchinese",
"liusinitic",
"tuled",
]
columns = [
'concept_name', 'language_id', 'value', 'form', 'segments',
'cognacy', "cogid_cognateset_id"]
namespace = (
('concept_name', 'concept'),
('language_id', 'doculect'),
("form", "form"),
("value", "value"),
('segments', 'tokens'),
('cognacy', 'cognacy'),
('cogid_cognateset_id', 'cog')
)
lexibank = Lexibank()
table = []
for row in lexibank.etc_dir.read_csv("lexibank.csv", dicts=True):
if row["CogCore"] == "x" and row["Dataset"] not in partial:
pth = lexibank.raw_dir.joinpath(row["Dataset"], "cldf")
wl = Wordlist.from_cldf(
pth.joinpath("cldf-metadata.json"),
columns=columns,
namespace=namespace
)
# workaround for some datasets
new_id = 1
for idx, cog in wl.iter_rows("cog"):
if not cog:
wl[idx, "cog"] = new_id
new_id += 1
# compute etymdict
etd = wl.get_etymdict(ref="cog")
# compute singletons
cognates = len([cog for (cog, idxs) in etd.items() if len(
[idx for idx in idxs if idx]) > 1])
table += [
[row["Dataset"], wl.height, wl.width, len(wl), cognates,
len(etd)-cognates]
]
print(tabulate(
table,
headers=["Dataset", "Concepts", "Languages", "Words", "Cognates",
"Singletons"],
tablefmt="pipe"
))
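To illustrate the counting step, here is a minimal sketch with made-up data: in LingPy, the etymological dictionary returned by `get_etymdict` maps each cognate set ID to one slot per doculect, each slot holding 0 or a list of word indices, and the script counts a set as a cognate set only if more than one slot is filled.

```python
# Made-up etymological dictionary in the shape returned by get_etymdict:
# cognate set ID -> one slot per doculect, each 0 or a list of word indices.
etd = {
    1: [[1], [4], 0],   # reflexes in two doculects -> counted as cognate set
    2: [0, [7], 0],     # a single reflex only -> singleton
    3: [[2], 0, [9]],   # another genuine cognate set
}
# Same counting logic as in code.py above.
cognates = len([cog for (cog, idxs) in etd.items()
                if len([idx for idx in idxs if idx]) > 1])
print(cognates, len(etd) - cognates)  # 2 1
```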