Skip to content

Instantly share code, notes, and snippets.

@LinguList
Last active July 14, 2024 11:59
Show Gist options
  • Save LinguList/f12bfd9acff2bec91525e1e6511e5adb to your computer and use it in GitHub Desktop.
Save LinguList/f12bfd9acff2bec91525e1e6511e5adb to your computer and use it in GitHub Desktop.
Converting an Artificial Proto-Language into Data for Testing Computational Approaches in Historical Linguistics (Supplementary Material)

Converting an Artificial Proto-Language into Data for Testing Computational Approaches in Historical Linguistics (Supplementary Material)

The material shared here allows users to test the code described in the original study by List (2024).

List, Johann-Mattis, "Converting an Artificial Proto-Language into Data for Testing Computational Approaches in Historical Linguistics," in Computer-Assisted Language Comparison in Practice, 7.2. URL: https://calc.hypotheses.org/7363, DOI: 10.15475/calcip.2024.2.1.

Installation

Install package dependencies.

pip install lingpy==2.6.13 pypdf==4.2.0 pysem==0.8.0

Download additional data.

git clone https://github.com/sequencecomparison/starostinpie 
cd starostinpie 
git checkout v1.0 
cd ..

Running the Code

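The code is in the script proto.py. Run it from the directory that contains the cloned starostinpie folder, because the script loads starostinpie/cldf/cldf-metadata.json via a relative path.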

python proto.py

NEXUS File

The file nexus.nex contains the data that was used to generate NeighborNet networks with SplitsTree.

Funding

This project has received funding from the European Research Council (ERC) under the European Union's Horizon Europe research and innovation programme (Grant agreement No. 101044282). The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.

#NEXUS
BEGIN CHARACTERS;
concept_16=1-1; [DRY]
concept_91=2-2; [WHO]
concept_81=3-3; [THIS]
concept_80=4-4; [THAT]
concept_3=5-5; [BARK]
concept_69=6-6; [SKIN]
concept_45=7-7; [MAN]
concept_17=8-8; [EAR]
concept_46=9-9; [MANY]
concept_32=10-10; [HEAD]
concept_88=11-11; [WARM]
concept_50=12-12; [NAME]
concept_4=13-13; [BELLY]
concept_40=14-14; [LEAF]
concept_26=15-15; [FLY (MOVE THROUGH AIR)]
concept_24=16-16; [FIRE]
concept_79=17-17; [TAIL]
concept_78=18-18; [SWIM]
concept_41=19-19; [LIE (REST)]
concept_1=20-20; [ALL]
concept_18=21-21; [EARTH (SOIL)]
concept_21=22-22; [EYE]
concept_33=23-23; [HEAR]
concept_64=24-24; [SAY]
concept_77=25-25; [SUN]
concept_10=26-26; [BONE]
concept_27=27-27; [FULL]
concept_44=28-28; [LOUSE]
concept_54=29-29; [NIGHT]
concept_62=30-30; [ROOT]
concept_63=31-31; [SAND]
concept_75=32-32; [STAR]
concept_95=33-33; [YELLOW]
concept_8=34-34; [BLACK]
concept_22=35-35; [FAR]
concept_34=36-36; [HEART]
concept_61=37-37; [ROAD]
concept_83=38-38; [TONGUE]
concept_2=39-39; [ASH]
concept_5=40-40; [BIG]
concept_6=41-41; [BIRD]
concept_7=42-42; [BITE]
concept_9=43-43; [BLOOD]
concept_11=44-44; [BREAST]
concept_12=45-45; [CLOUD]
concept_13=46-46; [COLD]
concept_14=47-47; [COME]
concept_15=48-48; [DIE]
concept_19=49-49; [EAT]
concept_20=50-50; [EGG]
concept_23=51-51; [FEATHER]
concept_25=52-52; [FISH]
concept_28=53-53; [GIVE]
concept_29=54-54; [GOOD]
concept_30=55-55; [GREEN]
concept_31=56-56; [HAIR]
concept_35=57-57; [HEAVY]
concept_36=58-58; [HORN (ANATOMY)]
concept_37=59-59; [I]
concept_38=60-60; [KILL]
concept_39=61-61; [KNEE]
concept_42=62-62; [LIVER]
concept_43=63-63; [LONG]
concept_47=64-64; [MOON]
concept_48=65-65; [MOUNTAIN]
concept_49=66-66; [MOUTH]
concept_51=67-67; [NEAR]
concept_52=68-68; [NECK]
concept_53=69-69; [NEW]
concept_55=70-70; [NOSE]
concept_56=71-71; [NOT]
concept_57=72-72; [ONE]
concept_58=73-73; [PERSON]
concept_59=74-74; [RAIN (PRECIPITATION)]
concept_60=75-75; [RED]
concept_65=76-76; [SEE]
concept_66=77-77; [SEED]
concept_67=78-78; [SHORT]
concept_68=79-79; [SIT]
concept_69=80-80; [SKIN]
concept_70=81-81; [SLEEP]
concept_71=82-82; [SMALL]
concept_72=83-83; [SMOKE (EXHAUST)]
concept_73=84-84; [SNAKE]
concept_74=85-85; [STAND]
concept_76=86-86; [STONE]
concept_82=87-87; [THOU]
concept_84=88-88; [TOOTH]
concept_85=89-89; [TREE]
concept_86=90-90; [TWO]
concept_87=91-91; [WALK]
concept_89=92-92; [WATER]
concept_90=93-93; [WHITE]
concept_92=94-94; [WIND]
concept_93=95-95; [WOMAN]
concept_94=96-96; [WORM]
concept_1=97-103; [ALL]
concept_2=104-111; [ASH]
concept_3=112-120; [BARK]
concept_4=121-128; [BELLY]
concept_5=129-137; [BIG]
concept_6=138-146; [BIRD]
concept_7=147-154; [BITE]
concept_8=155-162; [BLACK]
concept_9=163-168; [BLOOD]
concept_10=169-174; [BONE]
concept_11=175-183; [BREAST]
concept_12=184-193; [CLOUD]
concept_13=194-201; [COLD]
concept_14=202-209; [COME]
concept_15=210-214; [DIE]
concept_16=215-218; [DRY]
concept_17=219-223; [EAR]
concept_18=224-229; [EARTH (SOIL)]
concept_19=230-239; [EAT]
concept_20=240-247; [EGG]
concept_21=248-253; [EYE]
concept_22=254-262; [FAR]
concept_23=263-266; [FEATHER]
concept_24=267-274; [FIRE]
concept_25=275-281; [FISH]
concept_26=282-288; [FLY (MOVE THROUGH AIR)]
concept_27=289-295; [FULL]
concept_28=296-301; [GIVE]
concept_29=302-309; [GOOD]
concept_30=310-315; [GREEN]
concept_31=316-323; [HAIR]
concept_32=324-329; [HEAD]
concept_33=330-337; [HEAR]
concept_34=338-341; [HEART]
concept_35=342-351; [HEAVY]
concept_36=352-356; [HORN (ANATOMY)]
concept_37=357-364; [I]
concept_38=365-375; [KILL]
concept_39=376-381; [KNEE]
concept_40=382-388; [LEAF]
concept_41=389-394; [LIE (REST)]
concept_42=395-400; [LIVER]
concept_43=401-409; [LONG]
concept_44=410-415; [LOUSE]
concept_45=416-421; [MAN]
concept_46=422-428; [MANY]
concept_47=429-434; [MOON]
concept_48=435-441; [MOUNTAIN]
concept_49=442-448; [MOUTH]
concept_50=449-451; [NAME]
concept_51=452-460; [NEAR]
concept_52=461-468; [NECK]
concept_53=469-470; [NEW]
concept_54=471-474; [NIGHT]
concept_55=475-478; [NOSE]
concept_56=479-481; [NOT]
concept_57=482-485; [ONE]
concept_58=486-493; [PERSON]
concept_59=494-501; [RAIN (PRECIPITATION)]
concept_60=502-508; [RED]
concept_61=509-518; [ROAD]
concept_62=519-525; [ROOT]
concept_63=526-532; [SAND]
concept_64=533-540; [SAY]
concept_65=541-546; [SEE]
concept_66=547-551; [SEED]
concept_67=552-556; [SHORT]
concept_68=557-561; [SIT]
concept_69=562-568; [SKIN]
concept_70=569-574; [SLEEP]
concept_71=575-582; [SMALL]
concept_72=583-587; [SMOKE (EXHAUST)]
concept_73=588-598; [SNAKE]
concept_74=599-601; [STAND]
concept_75=602-605; [STAR]
concept_76=606-610; [STONE]
concept_77=611-614; [SUN]
concept_78=615-620; [SWIM]
concept_79=621-631; [TAIL]
concept_80=632-640; [THAT]
concept_81=641-644; [THIS]
concept_82=645-647; [THOU]
concept_83=648-652; [TONGUE]
concept_84=653-655; [TOOTH]
concept_85=656-662; [TREE]
concept_86=663-666; [TWO]
concept_87=667-676; [WALK]
concept_88=677-684; [WARM]
concept_89=685-690; [WATER]
concept_90=691-698; [WHITE]
concept_91=699-704; [WHO]
concept_92=705-708; [WIND]
concept_93=709-716; [WOMAN]
concept_94=717-722; [WORM]
concept_95=723-729; [YELLOW]
END; [CHARACTERS]
BEGIN DATA;
DIMENSIONS NTAX=20 NCHAR=729;
FORMAT DATATYPE=STANDARD SYMBOLS="10" GAP=- MISSING=? INTERLEAVE=yes;
MATRIX
armenian
bulgarian
czech
danish
dutch
english
french 110000?00000001010000000000000000000000000000000000000000000000000000000000000000000?000000000000100010000001000001000000100000010000000000000010001000000000001000000100100000100010000000001000010000000100000010000000010000000100000100000000001000010000000010000000110000000001000000000000010000001000001000000100000000001000100000100000010000000010001000000100000000000010000001000001000010000000100100000000001000??????000010010000010000000010000100001000000000010001000101000100000100100000001000000001000000000001000010000001000010000000010001000110100001000000100010000000001000010010000000000???00010100010000010000000000000000001000010001000010010001000001000001000000100010000000100010000000000000100001000001000000000010
german
greek 000000000000000000?111111000000000000000000000000000000000000000000?00?00000000000000000000000000000000000000100000010000001000000000001010000000000000100000000100000010000010000001000010000000000001001000000000001000100001000000000000010000000010000000000001000010000000100000001000010000000100001000000100000000100000000100001000000000010000001000000001001000000000000001000001000010000??????010000000010000001000001000000001010000000001000100000100000010000????????0110000001???10000000100000001000000010000000000010001001001000000000000000010000100000100010000001000000100000010000010001000000010010000100000000000100000000001000000001001000010001010000000100001000001000000001000000010000001000000100010010000000001000000001
hindi
icelandic
italian 110000?000000010000000000000000010000000000000000000000000000000000000000000000000000000000000000101000001000000100000000100000010000000000000000101000000000100000010000100000110000000010000000000010000100000010000001010000000100000010000000100000100000010000000001110000000001000000000000100000100000000010000100000010000000100000100000010000010000001000000000010100000000000010000010000001000010000100000000000100??????10000001000001000000001000010000100000001000000101000100010010000010000000000100100000000101000000000100000100001000000100000100001010010000010000001000001000000001001000000000010010000000110000010000010000000010000000001001000010010001000001000011000000000000010001000001000000000001000010000001000000000000
norwegian 00110000100011010000000000000??00000?0000000000000000000000000000000000000?000000000000000000000100000010000000100000000100000000100000001000000001000000000100000100000100000100000000010000000010000000100000000100010001000001000000100000001000000001000000000001010000000000110000001000000100000000000110000000100000100000001000001000000000100010000000100001000000001000000000100000100000010000010000010000000010000010000010000000100000010000100000001010000000010000000011000100001010001000000010000000???????????????????????????????10000000010000010001000010000100000010000010000000100000100100000010000101000010001000001000000000000000000000001001000010010000001000100000000010000000010000100000001000001000100000000010001000000
polish
portuguese
Proto
romanian
russian
spanish 110000?00100001010000000000000000000000000000000000000000000000000000000000000000000?000000000000101000001000000100000000100000010000000000110000001000000000100000010000100000110000000010000000010000000100000010000000010000000100000001000000100000100000010000000000110000000000100000000000001000100000000010000100000010000000000000010000010000010000001000000100000000100000000000010000001000001010000000000001000001??????100000010000010000000010000100000000100010000001010000100100100000000001000000011000000001000000010000000000100010000000010001000010100100000100000010000010000000010010000100000???10000100010000010000000010000010000000001001000010010001000001000011000000001000000001000001000000000000001000010001000000001000
swedish 001100001000100000000000000000000000000000000000000000000000000000000000000000000000000000000000100000010000000100000000100000000100000001000000001000000001000000100000100000100000000010000000110000000100000000100010001000001000010000000001000000010000011000000010000001000010000001000000100000000000110000000100000100000001000001000000000101010000000100001000000011000000000100000100000010000010000010000000010000010000010000000100000001000100000010010000000010000000011000100001010001000000010000000100000010000000001000000100000010000000010000010001000010000101000010000010010000100000000100000010000101000010001000000100000000000000000000001001000010010000000100100000000010000000100000100000001000001000100000000010001000000
;
END;
"""
Main code accompanying the study "Converting an Artificial Proto-Language into Data for Testing Computational Approaches in Historical Linguistics" by J.-M. List (2024).
"""
from urllib.request import urlopen
import tempfile
import pathlib
from pypdf import PdfReader
from lingpy import ipa2tokens, Wordlist, LexStat
from pysem import to_concepticon
# Download location of the supplementary PDF that is parsed below.
url = "https://gitlab.com/protolanguage1/protolanguage-supplement-lexicon/-/raw/main/SUPPLEMENTARY_MATERIALS.pdf?inline=false"

# Zero-based page index -> text extracted from that page.
pages = {}

# The PDF is only needed while its text is being extracted, so it is stored
# in a temporary directory that is removed when the ``with`` block ends.
with tempfile.TemporaryDirectory() as workdir:
    with urlopen(url) as response:
        payload = response.read()
    pdf_path = pathlib.Path(workdir, "data.pdf")
    pdf_path.write_bytes(payload)
    reader = PdfReader(pdf_path)
    pages.update(
        (number, sheet.extract_text())
        for number, sheet in enumerate(reader.pages)
    )
# One entry per (word, gloss) pair for which a Concepticon mapping was found:
# [doculect, original gloss, mapping field 0, mapping field 1, value, form,
# tokens]. The code further down uses mapping field 0 as the Concepticon
# identifier and mapping field 1 as the Concepticon gloss.
data = []
for page_number in range(3):  # only the pages with index 0, 1 and 2 are read
    for line in pages[page_number].split("\n"):
        # Lexicon lines have the shape "word = gloss, gloss, ..."; skip the rest.
        if "=" not in line:
            continue
        word, concepts = line.strip().split("=")
        for concept in concepts.strip().split(", "):
            mappings = to_concepticon([{"gloss": concept}])[concept]
            value = word.strip()
            # Replace Greek gamma by the IPA letter before segmenting.
            form = value.replace("γ", "ɣ")
            tokens = ipa2tokens(form)
            if not mappings:
                continue
            # Only the first mapping returned for the gloss is used.
            first_match = mappings[0]
            data.append([
                "Proto",
                concept,
                first_match[0],
                first_match[1],
                value,
                form,
                tokens,
            ])
# Concepticon identifier -> Concepticon gloss for every proto-language entry
# (if an identifier occurs more than once, the last entry wins).
id2gl = {entry[2]: entry[3] for entry in data}

# Comparison data, read from the local clone of the starostinpie repository.
pie = Wordlist.from_cldf("starostinpie/cldf/cldf-metadata.json")

# All Concepticon identifiers that occur in the comparison data.
overlap = {pie[idx, "concepticon"] for idx in pie}

# Proto-language entries whose concept also occurs in the comparison data.
# entry[3:] is (Concepticon gloss, value, form, tokens).
proto_entries = [
    ["Proto", *entry[3:]]
    for entry in data
    if entry[2] in overlap
]

# Comparison-data entries whose concept also occurs in the proto-language;
# their concept label is replaced by the gloss stored in id2gl.
pie_entries = [
    [
        pie[idx, "doculect"],
        id2gl[pie[idx, "concepticon"]],
        pie[idx, "value"],
        pie[idx, "form"],
        pie[idx, "tokens"],
    ]
    for idx in pie
    if pie[idx, "concepticon"] in id2gl
]

# Wordlist dictionary: key 0 holds the column names, the entries follow
# under consecutive keys starting at 1 (proto-language first).
wln = {0: [
    "doculect",
    "concept",
    "value",
    "form",
    "tokens",
]}
wln.update(enumerate(proto_entries + pie_entries, start=1))

# Next unused key, i.e. one more than the number of entries.
count = len(wln)
# Load the combined wordlist into LexStat.
lex = LexStat(wln)

# Cluster the words with the "sca" method and a threshold of 0.45; the
# result is stored under the column name "cogid".
cluster_settings = {
    "method": "sca",
    "ref": "cogid",
    "threshold": 0.45,
    "cluster_method": "upgma",
}
lex.cluster(**cluster_settings)

# Compute a UPGMA tree and print it as ASCII art.
lex.calculate("tree", tree_calc="upgma")
print(lex.tree.asciiArt())

# Export the listed columns in TSV format under the file name "wordlist".
exported_columns = ["doculect", "concept", "value", "form", "tokens", "cogid"]
lex.output(
    "tsv",
    filename="wordlist",
    ignore="all",
    prettify=False,
    subset=True,
    cols=exported_columns,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment