Skip to content

Instantly share code, notes, and snippets.

@LinguList
Last active July 15, 2022 04:36
Show Gist options
  • Save LinguList/35adb68afa1500bfbdecdad1630307bd to your computer and use it in GitHub Desktop.
Save LinguList/35adb68afa1500bfbdecdad1630307bd to your computer and use it in GitHub Desktop.
How to Compute Colexification Networks with CL Toolkit

How to Compute Colexification Networks with CL Toolkit (Supplementary Material)

This code example accompanies the Blog Post "How to Compute Colexification Networks with CL Toolkit (How to do X in Linguistics 11)" by List (2022).

You need to install the python-igraph package, networkx, lingpy, and cltoolkit:

$ pip install python-igraph
$ pip install networkx
$ pip install lingpy
$ pip install cltoolkit

For the visualization, you need Cytoscape.

As the dataset, we use the Intercontinental Dictionary Series (IDS), which you can either download from GitHub or clone directly with git:

$ git clone https://github.com/intercontinental-dictionary-series/ids

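To run the code, save it as colexification.py in the directory that contains the cloned ids folder (the script reads ids/cldf/cldf-metadata.json and writes its result to graph.gml in the current directory), then type: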

$ python colexification.py
"""
Compute colexifications for dedicated wordlists.
"""
from cltoolkit import Wordlist
from pycldf import Dataset
from collections import defaultdict
from itertools import combinations
from lingpy.convert.graph import networkx2igraph
import networkx as nx
import html
def get_colexifications(language, data):
    """
    Compute colexifications and add them to the data dictionary.

    Two concepts count as colexified in a language when the same word form
    is attested for both of them.

    Parameters:
        language: object exposing ``name`` and an iterable ``forms``. Each
            form needs a ``form`` attribute (the grouping key) and a
            ``concept`` attribute that is either falsy or exposes
            ``concepticon_gloss``.
        data: nested mapping that is updated in place, laid out as
            ``data[gloss_a, gloss_b][language.name] -> [(form_a, form_b)]``
            (for example ``defaultdict(lambda: defaultdict(list))``).

    Returns:
        None. All results are written into ``data``.
    """
    # Group the (gloss, form) pairs of this language by their word form;
    # forms without a linked concept are ignored.
    by_form = defaultdict(list)
    for form in language.forms:
        if form.concept:
            by_form[form.form].append((form.concept.concepticon_gloss, form))

    for colset in by_form.values():
        for (cA, fA), (cB, fB) in combinations(colset, r=2):
            if cA == cB:
                continue
            # Store every pair under one canonical key. Without this, the
            # key order follows the order of the forms, so the same pair
            # can end up as both (A, B) and (B, A) across languages.
            # str() keeps the comparison safe should a gloss not be a
            # string (e.g. None).
            if str(cB) < str(cA):
                (cA, fA), (cB, fB) = (cB, fB), (cA, fA)
            data[cA, cB][language.name] += [(fA, fB)]
# Load the IDS CLDF dataset (relative path: run from the directory that
# contains the "ids" clone) into a cltoolkit wordlist.
wl = Wordlist([Dataset.from_metadata("ids/cldf/cldf-metadata.json")])

# cols[concept_a, concept_b][language_name] -> list of (form_a, form_b)
cols = defaultdict(lambda: defaultdict(list))
for language in wl.languages:
    # Skip languages lacking a family, a glottocode, or a latitude.
    if not (language.family and language.glottocode and language.latitude):
        continue
    print(f"[i] analyzing language {language.name}")
    get_colexifications(language, cols)
# Build the colexification graph: one node per concept, one edge per
# colexified concept pair found in cols.
G = nx.Graph()
for (nA, nB), colexified in cols.items():
    # "frequency" counts how many pair entries of cols a concept occurs in.
    for concept in (nA, nB):
        if concept in G.nodes:
            G.nodes[concept]["frequency"] += 1
        else:
            G.add_node(concept, concept=concept, frequency=1)

    # Only the first form of the first attested pair per language is used
    # to represent that language on the edge.
    first_forms = [pairs[0][0] for pairs in colexified.values()]
    family_names = [form.language.family for form in first_forms]

    # NOTE(review): G is undirected, so a pair stored in cols under both
    # (A, B) and (B, A) would overwrite this edge's attributes.
    G.add_edge(
        nA,
        nB,
        languages=";".join(colexified),
        families=";".join(family_names),
        # The edge weight is the number of distinct families, not languages.
        weight=len(set(family_names)),
        words="/".join(form.id for form in first_forms),
    )
# Convert to igraph to run Infomap, weighting vertices by "frequency" and
# edges by "weight", then copy the community labels back onto G.
IG = networkx2igraph(G)
communities = IG.community_infomap(
    vertex_weights="frequency",
    edge_weights="weight",
)
for label, members in enumerate(communities, start=1):
    for vertex in members:
        # The igraph vertex attribute "Name" holds the networkx node key;
        # labels are stored as 1-based strings.
        G.nodes[IG.vs[vertex]["Name"]]["infomap"] = str(label)
# Write the graph as GML. html.unescape turns character references in the
# generated lines back into plain characters, so the output can contain
# non-ASCII text; the file is therefore opened as UTF-8 explicitly instead
# of relying on the platform's default encoding.
# NOTE(review): unescaping also restores references such as &quot; and
# &amp; -- confirm no attribute value relies on them staying escaped.
with open("graph.gml", "w", encoding="utf-8") as f:
    f.writelines(html.unescape(line) + "\n" for line in nx.generate_gml(G))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment