Skip to content

Instantly share code, notes, and snippets.

@xflr6
Last active January 8, 2022 14:08
Show Gist options
  • Save xflr6/f3a13047e371c29caf60 to your computer and use it in GitHub Desktop.
Save xflr6/f3a13047e371c29caf60 to your computer and use it in GitHub Desktop.
Dump basic https://glottolog.org languoid info into CSV file
"""Dump basic https://glottolog.org languoid info to CSV file."""
import pandas as pd
ENGINE = 'postgresql://postgres@/glottolog3'
QUERY = '''
SELECT
l.id AS glottocode,
l.name,
p.id AS parent_id,
f.id AS family_id,
ll.level,
ll.category,
l.latitude,
l.longitude,
(
SELECT STRING_AGG(i.name, ', ' ORDER BY i.name)
FROM languageidentifier AS li
JOIN identifier AS i ON i.pk = li.identifier_pk AND i.type = 'iso639-3'
WHERE li.language_pk = l.pk
) AS iso_code,
(
SELECT STRING_AGG(SUBSTRING(link->>'url' FROM '_([a-z]+)$'), ', ')
FROM jsonb_array_elements(l.jsondata::jsonb->'links') AS link
WHERE link->>'url' ~ '^https://wals.info/languoid/lect/wals_code_[a-z]+$'
) AS wals_codes,
ll.macroareas,
(
SELECT STRING_AGG(de.name, ', ' ORDER BY de.name)
FROM parameter AS p
JOIN valueset AS vs ON vs.parameter_pk = p.pk AND vs.language_pk = l.pk
JOIN value AS v ON v.valueset_pk = vs.pk
JOIN domainelement AS de ON de.pk = v.domainelement_pk
WHERE p.id = 'country'
) AS countries,
(
SELECT de.name
FROM parameter AS p
JOIN valueset AS vs ON vs.parameter_pk = p.pk AND vs.language_pk = l.pk
JOIN value AS v ON v.valueset_pk = vs.pk
JOIN domainelement AS de ON de.pk = v.domainelement_pk
WHERE p.id = 'aes'
) AS aes
FROM language AS l
JOIN languoid AS ll USING (pk)
LEFT JOIN language AS p ON ll.father_pk = p.pk
LEFT JOIN language AS f ON ll.family_pk = f.pk
WHERE l.active
ORDER BY l.id
'''.strip()
df = pd.read_sql_query(QUERY, ENGINE, index_col='glottocode')
df.info(memory_usage='deep')
assert df.index.is_unique
assert df.index.is_monotonic_increasing
df.to_csv('languoids.csv', encoding='utf-8')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment