Skip to content

Instantly share code, notes, and snippets.

@LinguList
Last active July 14, 2024 11:59
Show Gist options
  • Save LinguList/f12bfd9acff2bec91525e1e6511e5adb to your computer and use it in GitHub Desktop.
Save LinguList/f12bfd9acff2bec91525e1e6511e5adb to your computer and use it in GitHub Desktop.
Converting an Artificial Proto-Language into Data for Testing Computational Approaches in Historical Linguistics (Supplementary Material)

Converting an Artificial Proto-Language into Data for Testing Computational Approaches in Historical Linguistics (Supplementary Material)

The material shared here allows users to test the code described in the original study by List (2024).

List, Johann-Mattis, "Converting an Artificial Proto-Language into Data for Testing Computational Approaches in Historical Linguistics," in Computer-Assisted Language Comparison in Practice, 7.2. URL: https://calc.hypotheses.org/7363, DOI: 10.15475/calcip.2024.2.1.

Installation

Install package dependencies.

pip install lingpy==2.6.13 pypdf==4.2.0 pysem==0.8.0

Download additional data.

git clone https://github.com/sequencecomparison/starostinpie 
cd starostinpie 
git checkout v1.0 
cd ..

Running the Code

Code is in the script proto.py.

python proto.py

NEXUS File

The file nexus.nex contains the data that was used to generate NeighborNet networks with SplitsTree.

Funding

This project has received funding from the European Research Council (ERC) under the European Union's Horizon Europe research and innovation programme (Grant agreement No. 101044282). The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.

#NEXUS
BEGIN CHARACTERS;
concept_16=1-1; [DRY]
concept_91=2-2; [WHO]
concept_81=3-3; [THIS]
concept_80=4-4; [THAT]
concept_3=5-5; [BARK]
concept_69=6-6; [SKIN]
concept_45=7-7; [MAN]
concept_17=8-8; [EAR]
concept_46=9-9; [MANY]
concept_32=10-10; [HEAD]
concept_88=11-11; [WARM]
concept_50=12-12; [NAME]
concept_4=13-13; [BELLY]
concept_40=14-14; [LEAF]
concept_26=15-15; [FLY (MOVE THROUGH AIR)]
concept_24=16-16; [FIRE]
concept_79=17-17; [TAIL]
concept_78=18-18; [SWIM]
concept_41=19-19; [LIE (REST)]
concept_1=20-20; [ALL]
concept_18=21-21; [EARTH (SOIL)]
concept_21=22-22; [EYE]
concept_33=23-23; [HEAR]
concept_64=24-24; [SAY]
concept_77=25-25; [SUN]
concept_10=26-26; [BONE]
concept_27=27-27; [FULL]
concept_44=28-28; [LOUSE]
concept_54=29-29; [NIGHT]
concept_62=30-30; [ROOT]
concept_63=31-31; [SAND]
concept_75=32-32; [STAR]
concept_95=33-33; [YELLOW]
concept_8=34-34; [BLACK]
concept_22=35-35; [FAR]
concept_34=36-36; [HEART]
concept_61=37-37; [ROAD]
concept_83=38-38; [TONGUE]
concept_2=39-39; [ASH]
concept_5=40-40; [BIG]
concept_6=41-41; [BIRD]
concept_7=42-42; [BITE]
concept_9=43-43; [BLOOD]
concept_11=44-44; [BREAST]
concept_12=45-45; [CLOUD]
concept_13=46-46; [COLD]
concept_14=47-47; [COME]
concept_15=48-48; [DIE]
concept_19=49-49; [EAT]
concept_20=50-50; [EGG]
concept_23=51-51; [FEATHER]
concept_25=52-52; [FISH]
concept_28=53-53; [GIVE]
concept_29=54-54; [GOOD]
concept_30=55-55; [GREEN]
concept_31=56-56; [HAIR]
concept_35=57-57; [HEAVY]
concept_36=58-58; [HORN (ANATOMY)]
concept_37=59-59; [I]
concept_38=60-60; [KILL]
concept_39=61-61; [KNEE]
concept_42=62-62; [LIVER]
concept_43=63-63; [LONG]
concept_47=64-64; [MOON]
concept_48=65-65; [MOUNTAIN]
concept_49=66-66; [MOUTH]
concept_51=67-67; [NEAR]
concept_52=68-68; [NECK]
concept_53=69-69; [NEW]
concept_55=70-70; [NOSE]
concept_56=71-71; [NOT]
concept_57=72-72; [ONE]
concept_58=73-73; [PERSON]
concept_59=74-74; [RAIN (PRECIPITATION)]
concept_60=75-75; [RED]
concept_65=76-76; [SEE]
concept_66=77-77; [SEED]
concept_67=78-78; [SHORT]
concept_68=79-79; [SIT]
concept_69=80-80; [SKIN]
concept_70=81-81; [SLEEP]
concept_71=82-82; [SMALL]
concept_72=83-83; [SMOKE (EXHAUST)]
concept_73=84-84; [SNAKE]
concept_74=85-85; [STAND]
concept_76=86-86; [STONE]
concept_82=87-87; [THOU]
concept_84=88-88; [TOOTH]
concept_85=89-89; [TREE]
concept_86=90-90; [TWO]
concept_87=91-91; [WALK]
concept_89=92-92; [WATER]
concept_90=93-93; [WHITE]
concept_92=94-94; [WIND]
concept_93=95-95; [WOMAN]
concept_94=96-96; [WORM]
concept_1=97-103; [ALL]
concept_2=104-111; [ASH]
concept_3=112-120; [BARK]
concept_4=121-128; [BELLY]
concept_5=129-137; [BIG]
concept_6=138-146; [BIRD]
concept_7=147-154; [BITE]
concept_8=155-162; [BLACK]
concept_9=163-168; [BLOOD]
concept_10=169-174; [BONE]
concept_11=175-183; [BREAST]
concept_12=184-193; [CLOUD]
concept_13=194-201; [COLD]
concept_14=202-209; [COME]
concept_15=210-214; [DIE]
concept_16=215-218; [DRY]
concept_17=219-223; [EAR]
concept_18=224-229; [EARTH (SOIL)]
concept_19=230-239; [EAT]
concept_20=240-247; [EGG]
concept_21=248-253; [EYE]
concept_22=254-262; [FAR]
concept_23=263-266; [FEATHER]
concept_24=267-274; [FIRE]
concept_25=275-281; [FISH]
concept_26=282-288; [FLY (MOVE THROUGH AIR)]
concept_27=289-295; [FULL]
concept_28=296-301; [GIVE]
concept_29=302-309; [GOOD]
concept_30=310-315; [GREEN]
concept_31=316-323; [HAIR]
concept_32=324-329; [HEAD]
concept_33=330-337; [HEAR]
concept_34=338-341; [HEART]
concept_35=342-351; [HEAVY]
concept_36=352-356; [HORN (ANATOMY)]
concept_37=357-364; [I]
concept_38=365-375; [KILL]
concept_39=376-381; [KNEE]
concept_40=382-388; [LEAF]
concept_41=389-394; [LIE (REST)]
concept_42=395-400; [LIVER]
concept_43=401-409; [LONG]
concept_44=410-415; [LOUSE]
concept_45=416-421; [MAN]
concept_46=422-428; [MANY]
concept_47=429-434; [MOON]
concept_48=435-441; [MOUNTAIN]
concept_49=442-448; [MOUTH]
concept_50=449-451; [NAME]
concept_51=452-460; [NEAR]
concept_52=461-468; [NECK]
concept_53=469-470; [NEW]
concept_54=471-474; [NIGHT]
concept_55=475-478; [NOSE]
concept_56=479-481; [NOT]
concept_57=482-485; [ONE]
concept_58=486-493; [PERSON]
concept_59=494-501; [RAIN (PRECIPITATION)]
concept_60=502-508; [RED]
concept_61=509-518; [ROAD]
concept_62=519-525; [ROOT]
concept_63=526-532; [SAND]
concept_64=533-540; [SAY]
concept_65=541-546; [SEE]
concept_66=547-551; [SEED]
concept_67=552-556; [SHORT]
concept_68=557-561; [SIT]
concept_69=562-568; [SKIN]
concept_70=569-574; [SLEEP]
concept_71=575-582; [SMALL]
concept_72=583-587; [SMOKE (EXHAUST)]
concept_73=588-598; [SNAKE]
concept_74=599-601; [STAND]
concept_75=602-605; [STAR]
concept_76=606-610; [STONE]
concept_77=611-614; [SUN]
concept_78=615-620; [SWIM]
concept_79=621-631; [TAIL]
concept_80=632-640; [THAT]
concept_81=641-644; [THIS]
concept_82=645-647; [THOU]
concept_83=648-652; [TONGUE]
concept_84=653-655; [TOOTH]
concept_85=656-662; [TREE]
concept_86=663-666; [TWO]
concept_87=667-676; [WALK]
concept_88=677-684; [WARM]
concept_89=685-690; [WATER]
concept_90=691-698; [WHITE]
concept_91=699-704; [WHO]
concept_92=705-708; [WIND]
concept_93=709-716; [WOMAN]
concept_94=717-722; [WORM]
concept_95=723-729; [YELLOW]
END; [CHARACTERS]
BEGIN DATA;
DIMENSIONS NTAX=20 NCHAR=729;
FORMAT DATATYPE=STANDARD SYMBOLS="10" GAP=- MISSING=? INTERLEAVE=yes;
MATRIX
armenian ??0010?00000?000010?0000000000000000000000000?0000000000?0000000000000000000000000?0000000?00?0????????00001000000000000????????00010000000001000000010000001000000001000001000000100000000100000????????0000100000100????001000000101000000000000001000010000001000001000000010000000100001000000010001000000000010000010000001000000100000010001000??????????0010010000000000010000000010000000100000100100000010000000000010??????00010000001000100000000010001000010000000000100010100001000100100001000000010000001000000000100000000010001000000010000000100100001000001000000100000001001000000?????0000001000001010000001000010000000001000000001000000010001000001000100010000010??????????0010000010000000001000??????????10000000??????0000100
bulgarian 110011110011000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000001000000000000000000001000000010000100000000000100010000000010000001000000100000100000000000010000000001001000000000000010000000000010000100000010000000100000001000100000001000000100000001000000010001000000001000000001000100000000010010000100000000010000001000000000100000010000010000010000000100000100000001000000000001000001000000000010010000000001000000000100001101000100010001000100000001000000010000000010000000100000010000000100000100000100001100010000000000000100000100000010000010000000010001000010001000100000000001000000000100010001000100001000100000100000100000000000000100000010000000000000100010000000100000100000
czech 101011110010000000000000000000000000000000000000000000000000000000000000000000000000000000000?0000100000100000000000000000100000001000000010000000001000001000000001000000100010000000010000000000011000000100000100000000000001000000100000000010000001000000010000000100010000000100000010000001000001000000100000000100001000000010000010000001000010000000001000100000000010000000001000001000001000000010000001000000100000000000100000001000010000101000000010100000000000001010100010001000100010000000100000001000000000001000010000001000000000100010000010000010001000000000000010000010000001000000000010001000100001000100010000000000010000010000000000100010000101000000010000001000000000000010000001000000010000????010000000100000100000
danish 001100001000100100000000000000000000000000000000000000000000000000000000000000000000000000000000100000010000000100000000000000100100000001000000001000000000100000100000100000100000000010000000010000000100000000100010000001001000000100000001000000001000011000000010000000000010000001000000100000000001010000000100000100000001000001000000000101010000000100001000000001000000000100000100000010000010000010000000010000010000000000000100000010000100000001010000000010000000011000100001010001000000010000000100000000000001001000000100000010000000010000010001000010000100000010000010000000100000100000000010000101000010001000001000000000001000000000001001000010010000001000100000000010000000010000100000001000001000100000001000001000000
dutch 000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000010000000001000000100000001000000001000000001000000001000000100000100000100000000000100000010000000100000000001001000100000100010000000000001000010000010000000010000010000010000001000000100000001000010000000100000100000001000001000000010001000000000100000100000010000000000100000100000010000010000010000000010000010000000100000100000001000100000010010000001010010000101000100010010001000000010000000100000010000000000010000100000010000000010000001001000010000110000000010000001000100000100000000010010001000001001000000100000000000000000010000101000010000001000100100000000010000000100000100000001000001000000100001000011000000
english 000000001000010000100000000000000000000000000000000000000000000000000000000000000000000000000000100000010000000100000000000100001000001000000010001000000000001000100000100000100000000000001000010000000100000000100001000100000100010000000001000000000010010000000010000010000010000001000000100000000001010000000100000100000001000001000000010000000010000100000000100000000010000100000000000000000010000010000000010000010000010000000100001000000000100010010000000100010000011000100010010001000000010000000100000010010000001000000100000010000000010000001001000010000001000000010010010000000100000000010010010001000001001000000000000010000010000010000101000010010000001000100000000010000000100000100000000001001000001000001000000010000
french 110000?00000001010000000000000000000000000000000000000000000000000000000000000000000?000000000000100010000001000001000000100000010000000000000010001000000000001000000100100000100010000000001000010000000100000010000000010000000100000100000000001000010000000010000000110000000001000000000000010000001000001000000100000000001000100000100000010000000010001000000100000000000010000001000001000010000000100100000000001000??????000010010000010000000010000100001000000000010001000101000100000100100000001000000001000000000001000010000001000010000000010001000110100001000000100010000000001000010010000000000???00010100010000010000000000000000001000010001000010010001000001000001000000100010000000100010000000000000100001000001000000000010
german 000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000010000000010000100000001000000001000000001000000001000000100000000010100000000000100000010000000100000000001001000100000100000010000000001000010000010000000010000010000010000001000000100000001000010000000100000100000000000001000000010001000000000100000100000010000000000100000100000010000010000010000000010000010000000100000100000001000100000010010000000010000000011000100010010001000000010000000100000010000000000010000100000010000000010000100001000010000100000000010000001000100000100000000010010001000001001000000000100000000000010010001001000010000001000100100000000010000000100000100000000010001000000100011000001000000
greek 000000000000000000?111111000000000000000000000000000000000000000000?00?00000000000000000000000000000000000000100000010000001000000000001010000000000000100000000100000010000010000001000010000000000001001000000000001000100001000000000000010000000010000000000001000010000000100000001000010000000100001000000100000000100000000100001000000000010000001000000001001000000000000001000001000010000??????010000000010000001000001000000001010000000001000100000100000010000????????0110000001???10000000100000001000000010000000000010001001001000000000000000010000100000100010000001000000100000010000010001000000010010000100000000000100000000001000000001001000010001010000000100001000001000000001000000010000001000000100010010000000001000000001
hindi 110000000000000000000000011111110000000000?000000000000000000000000000000?000000000000000000??00000010100000001000000100000010000000100000000000100000001000000001??????0000000000000100000000100000000100100000000100000000100001000000000001000000001100000000000100001000000010000000100001000000000001000000000100000010001000000000100000010000100001000000000100000100000000000100100000000010000010000010000001000000000000100000100000001001000000001000100001000000010000000100001000100001000000100????????000001001000000000000000000001000000101000100001000001000001000000110000000000100010000000000001000100000100000100000010001000000000000000100101000000110000010001000000000100010000000010000????????000000????000001000000100010000
icelandic 000000100000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000100010000000100000000000000010101000001000000001000000001000000100000100000100000000010000000010000000100000000100010001000001000000000000011000000010000001000000010000001000010000001000000000001001000000000001100000100000001000001000000010000000000100100001000000001000000000100000100000010000010000010000000010000000000010000000100010010000100000001010000000010000000011000100001010000000001010000000100000010000000001000000100000010000000000001010000001010000101000010000010010000100000000000000110000101000010000001001100000000000011000010001001000010010000010100000100000000000100100000000000100010001000100000001010001000000
italian 110000?000000010000000000000000010000000000000000000000000000000000000000000000000000000000000000101000001000000100000000100000010000000000000000101000000000100000010000100000110000000010000000000010000100000010000001010000000100000010000000100000100000010000000001110000000001000000000000100000100000000010000100000010000000100000100000010000010000001000000000010100000000000010000010000001000010000100000000000100??????10000001000001000000001000010000100000001000000101000100010010000010000000000100100000000101000000000100000100001000000100000100001010010000010000001000001000000001001000000000010010000000110000010000010000000010000000001001000010010001000001000011000000000000010001000001000000000001000010000001000000000000
norwegian 00110000100011010000000000000??00000?0000000000000000000000000000000000000?000000000000000000000100000010000000100000000100000000100000001000000001000000000100000100000100000100000000010000000010000000100000000100010001000001000000100000001000000001000000000001010000000000110000001000000100000000000110000000100000100000001000001000000000100010000000100001000000001000000000100000100000010000010000010000000010000010000010000000100000010000100000001010000000010000000011000100001010001000000010000000???????????????????????????????10000000010000010001000010000100000010000010000000100000100100000010000101000010001000001000000000000000000000001001000010010000001000100000000010000000010000100000001000001000100000000010001000000
polish 101011010001000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000001000000000000000100000000010000010100000000010000010000000010000001000100000000100000000000100001000100101000000000000010000001000000000100000010000000100000001000100000001000000100000010000000010001000000001000010000000100000100000010000100000000010001000000000100000000010000010000010000000100000000010001000000001000100010010000000100010000000001000000000100000101000100010001000100000001000000010000001000000000100000010000000000010100000100000100010000000000000100000000001010000011000000010001000010001000100000000000000100100000000001000100001000100000100000000010000000001100000000000010100000100000000100100000100000
portuguese 110001000100000000000000000000000110000000000000000000000000000000000000000000000000?000000000000100000000100000000000100110000010000000000110000001000000000000000010000100000110000000010000000010000000000000110000001010000000100000001000000100000000010000000000001010000000001000000000100000001100000001000000100000010000000000000010000010000010000001000000000001000100000000000100010000001000010000100000000001000010000100000010000010000000010000100001000000010000001010000100100000100100000000000100010000001010000010000000000100010000000010001000010100100000100000010000010000000010010000100000???10000100010000010000010010000010000000001001000010010001000001000010000000001000000001000000100000000000100000010001000000001000
Proto 111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
romanian 100100000100000000000000000000000001110000000000000000000000000000000000000000000000000000000000010100000100000000000001010000000000100000010000000000000100010000001000010000010000001000000001001001000010000001000000101000000000100001000000000100010000000000000100101000000000010000000001001000010000000100000010000000100000000000000000100000000000010100001000000010000000000001000000100001000001000010000000000010001000110000001000001000000000001010000000100000001000100001100010010000010000000100000100000000000000000000100000000110000000100000100001000010000010000001000000000010001001000000000010010000100000100001000010000000000000000000011000000010001000001000000000001001000000000001000100000000011000001000001000001000000
russian 101111110011000000000000000000000000001000000000000000000000000000000000000000000000000000000000001000001000000000000000000010000010000000100000000010000010000000010000001000000100000100000000000100000001100001000000000000010000001000000000100000000000100100000001000100000001000000100000010000010000000010000001000010000000100000100000010000000000001010001000000000000000001010000010000010000000000100000001001000000000001000001000000000100000000100001000000000100000101000100010001000100000001000000000000101000000000100000010000000100000100000100000100010000000000000100000100000010000010000000010001000010001000100000000100000000000000000001000100001000100000100010000000000000000100000010000000100000100010000000100000100000
spanish 110000?00100001010000000000000000000000000000000000000000000000000000000000000000000?000000000000101000001000000100000000100000010000000000110000001000000000100000010000100000110000000010000000010000000100000010000000010000000100000001000000100000100000010000000000110000000000100000000000001000100000000010000100000010000000000000010000010000010000001000000100000000100000000000010000001000001010000000000001000001??????100000010000010000000010000100000000100010000001010000100100100000000001000000011000000001000000010000000000100010000000010001000010100100000100000010000010000000010010000100000???10000100010000010000000010000010000000001001000010010001000001000011000000001000000001000001000000000000001000010001000000001000
swedish 001100001000100000000000000000000000000000000000000000000000000000000000000000000000000000000000100000010000000100000000100000000100000001000000001000000001000000100000100000100000000010000000110000000100000000100010001000001000010000000001000000010000011000000010000001000010000001000000100000000000110000000100000100000001000001000000000101010000000100001000000011000000000100000100000010000010000010000000010000010000010000000100000001000100000010010000000010000000011000100001010001000000010000000100000010000000001000000100000010000000010000010001000010000101000010000010010000100000000100000010000101000010001000000100000000000000000000001001000010010000000100100000000010000000100000100000001000001000100000000010001000000
;
END;
"""
Main code accompanying the study "Converting an Artificial Proto-Language into Data for Testing Computational Approaches in Historical Linguistics" by J.-M. List (2024).
"""
from urllib.request import urlopen
import tempfile
import pathlib
from pypdf import PdfReader
from lingpy import ipa2tokens, Wordlist, LexStat
from pysem import to_concepticon
# Pipeline: download the supplementary PDF, parse "word = gloss, gloss" rows
# from it, link the glosses to Concepticon, merge the resulting "Proto"
# entries with the wordlist in starostinpie/, then cluster cognates and
# compute a UPGMA tree with LexStat.
url = "https://gitlab.com/protolanguage1/protolanguage-supplement-lexicon/-/raw/main/SUPPLEMENTARY_MATERIALS.pdf?inline=false"

# Extracted text of every PDF page, keyed by zero-based page index.
pages = {}
with tempfile.TemporaryDirectory() as t:
    with urlopen(url) as req:
        payload = req.read()
    # The PDF is written to a temporary file and read back from that path;
    # everything that touches the file stays inside the temporary-directory
    # context so the file is guaranteed to exist.
    path = pathlib.Path(t) / "data.pdf"
    path.write_bytes(payload)
    pdf = PdfReader(path)
    for i, page in enumerate(pdf.pages):
        pages[i] = page.extract_text()

# One entry per (word, mapped gloss) pair:
# [doculect, original gloss, mappings[0][0], mappings[0][1], value, form, tokens]
data = []
# Only the first three pages are parsed.
for i in range(3):
    for row in pages[i].split("\n"):
        # Lexicon rows are recognised by the "=" separating word and glosses.
        if "=" not in row:
            continue
        word, concepts = row.strip().split("=")
        # The written form is the same for all glosses of a row, so it is
        # normalised once per row rather than once per gloss.
        value = word.strip()
        # Replace Greek gamma (U+03B3) by the Latin gamma letter (U+0263).
        form = value.replace("γ", "ɣ")
        for concept in concepts.strip().split(", "):
            mappings = to_concepticon(
                [{"gloss": concept}]
            )[concept]
            # Glosses without any mapping are dropped.
            if not mappings:
                continue
            # Only the first mapping is used; its first two fields are
            # presumably the Concepticon ID and gloss (see their use below)
            # -- TODO confirm against the pysem documentation.
            data.append([
                "Proto",
                concept,
                mappings[0][0],
                mappings[0][1],
                value,
                form,
                ipa2tokens(form),
            ])

# Lookup from the mapped concept identifier (row[2]) to its label (row[3]).
id2gl = {row[2]: row[3] for row in data}

pie = Wordlist.from_cldf("starostinpie/cldf/cldf-metadata.json")
# Concept identifiers that occur in the comparison wordlist.
overlap = {pie[idx, "concepticon"] for idx in pie}

# Combined wordlist in LingPy's dictionary layout: key 0 holds the header,
# consecutive positive integers hold the rows.
wln = {0: [
    "doculect",
    "concept",
    "value",
    "form",
    "tokens",
]}
count = 1

# Proto entries are kept only for concepts that the comparison data has, too.
for row in data:
    if row[2] in overlap:
        wln[count] = ["Proto", row[3], row[4], row[5], row[6]]
        count += 1

# Entries of the comparison wordlist are kept only for concepts attested in
# the Proto data, and are relabelled with the label stored in id2gl so that
# both sources share the same concept names.
for idx in pie:
    concepticon = pie[idx, "concepticon"]
    if concepticon in id2gl:
        wln[count] = [
            pie[idx, "doculect"],
            id2gl[concepticon],
            pie[idx, "value"],
            pie[idx, "form"],
            pie[idx, "tokens"],
        ]
        count += 1

# Cognate detection (results stored in the "cogid" column), tree
# calculation, and export.
lex = LexStat(wln)
lex.cluster(method="sca", ref="cogid", threshold=0.45, cluster_method="upgma")
lex.calculate("tree", tree_calc="upgma")
print(lex.tree.asciiArt())
lex.output("tsv", filename="wordlist", ignore="all", prettify=False,
           subset=True,
           cols=["doculect", "concept", "value", "form", "tokens", "cogid"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment