Last active
January 8, 2022 14:08
-
-
Save xflr6/f3a13047e371c29caf60 to your computer and use it in GitHub Desktop.
Dump basic https://glottolog.org languoid info into CSV file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Dump basic https://glottolog.org languoid info to CSV file.""" | |
import pandas as pd | |
ENGINE = 'postgresql://postgres@/glottolog3' | |
QUERY = ''' | |
SELECT | |
l.id AS glottocode, | |
l.name, | |
p.id AS parent_id, | |
f.id AS family_id, | |
ll.level, | |
ll.category, | |
l.latitude, | |
l.longitude, | |
( | |
SELECT STRING_AGG(i.name, ', ' ORDER BY i.name) | |
FROM languageidentifier AS li | |
JOIN identifier AS i ON i.pk = li.identifier_pk AND i.type = 'iso639-3' | |
WHERE li.language_pk = l.pk | |
) AS iso_code, | |
( | |
SELECT STRING_AGG(SUBSTRING(link->>'url' FROM '_([a-z]+)$'), ', ') | |
FROM jsonb_array_elements(l.jsondata::jsonb->'links') AS link | |
WHERE link->>'url' ~ '^https://wals.info/languoid/lect/wals_code_[a-z]+$' | |
) AS wals_codes, | |
ll.macroareas, | |
( | |
SELECT STRING_AGG(de.name, ', ' ORDER BY de.name) | |
FROM parameter AS p | |
JOIN valueset AS vs ON vs.parameter_pk = p.pk AND vs.language_pk = l.pk | |
JOIN value AS v ON v.valueset_pk = vs.pk | |
JOIN domainelement AS de ON de.pk = v.domainelement_pk | |
WHERE p.id = 'country' | |
) AS countries, | |
( | |
SELECT de.name | |
FROM parameter AS p | |
JOIN valueset AS vs ON vs.parameter_pk = p.pk AND vs.language_pk = l.pk | |
JOIN value AS v ON v.valueset_pk = vs.pk | |
JOIN domainelement AS de ON de.pk = v.domainelement_pk | |
WHERE p.id = 'aes' | |
) AS aes | |
FROM language AS l | |
JOIN languoid AS ll USING (pk) | |
LEFT JOIN language AS p ON ll.father_pk = p.pk | |
LEFT JOIN language AS f ON ll.family_pk = f.pk | |
WHERE l.active | |
ORDER BY l.id | |
'''.strip() | |
df = pd.read_sql_query(QUERY, ENGINE, index_col='glottocode') | |
df.info(memory_usage='deep') | |
assert df.index.is_unique | |
assert df.index.is_monotonic_increasing | |
df.to_csv('languoids.csv', encoding='utf-8') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment