@xflr6
Last active January 8, 2022 14:06
Download and combine https://glottolog.org/langdoc.csv and https://glottolog.org/glottolog/language.csv parts using pandas
"""Combine https://glottolog.org/langdoc.csv parts."""
import urllib.parse
import pandas as pd
ENDPOINT = urllib.parse.urlparse('https://glottolog.org/langdoc.csv')
QUERY = {'sEcho': 1,
'iSortingCols': 1,
'iSortCol_0': 1, 'sSortDir_0': 'asc', # sort by name
'iDisplayStart': None, 'iDisplayLength': None}
N = 1_000
ENCODING = 'utf-8'
df = []
for offset in range(0, 348_000, N):
query = QUERY.copy()
query.update(iDisplayStart=offset, iDisplayLength=N)
url = ENDPOINT._replace(query=urllib.parse.urlencode(query)).geturl()
print(url)
df.append(pd.read_csv(url, encoding=ENCODING, index_col='id'))
df = pd.concat(df)
df.info(memory_usage='deep')
assert df.index.is_unique
assert not df.index.is_monotonic_increasing
df.to_csv('langdoc.csv', encoding=ENCODING)
assert 1_000 < df['name'].duplicated().sum() < 100_000
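
This script and the one below differ only in endpoint, sort column, total row count, and output file name. As a sketch of how the shared paging loop could be factored out (the fetch_paged_csv helper and its parameters are hypothetical, not part of the original gist, and assume the export keeps accepting DataTables-style iDisplayStart/iDisplayLength parameters):

"""Hypothetical helper: factor out the paged download shared by both scripts."""

import urllib.parse

import pandas as pd


def fetch_paged_csv(url, *, total, sort_col, page_size=1_000, encoding='utf-8'):
    """Download a DataTables-backed CSV export in page_size chunks and concatenate them."""
    endpoint = urllib.parse.urlparse(url)
    frames = []
    for offset in range(0, total, page_size):
        query = {'sEcho': 1,
                 'iSortingCols': 1,
                 'iSortCol_0': sort_col, 'sSortDir_0': 'asc',
                 'iDisplayStart': offset, 'iDisplayLength': page_size}
        page_url = endpoint._replace(query=urllib.parse.urlencode(query)).geturl()
        frames.append(pd.read_csv(page_url, encoding=encoding, index_col='id'))
    return pd.concat(frames)

With such a helper, the langdoc script above would reduce to fetch_paged_csv('https://glottolog.org/langdoc.csv', total=348_000, sort_col=1) followed by the same assertions and to_csv call.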
"""Combine https://glottolog.org/glottolog/language.csv parts."""
import urllib.parse
import pandas as pd
ENDPOINT = urllib.parse.urlparse('https://glottolog.org/glottolog/language.csv')
QUERY = {'sEcho': 1,
'iSortingCols': 1,
'iSortCol_0': 0, 'sSortDir_0': 'asc', # sort by id (glottocode)
'iDisplayStart': None, 'iDisplayLength': None}
N = 1_000
ENCODING = 'utf-8'
df = []
for offset in range(0, 9_000, N):
query = QUERY.copy()
query.update(iDisplayStart=offset, iDisplayLength=N)
url = ENDPOINT._replace(query=urllib.parse.urlencode(query)).geturl()
print(url)
df.append(pd.read_csv(url, encoding=ENCODING, index_col='id'))
df = pd.concat(df)
df.info(memory_usage='deep')
assert df.index.is_unique
assert df.index.is_monotonic_increasing
df.to_csv('language.csv', encoding=ENCODING)
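
For a quick check of the combined outputs, a minimal sketch of reading them back with pandas (the file names match the to_csv calls above; the printed summary is illustrative):

"""Sketch: re-load the combined CSV files written by the two scripts."""

import pandas as pd

langdoc = pd.read_csv('langdoc.csv', encoding='utf-8', index_col='id')
languages = pd.read_csv('language.csv', encoding='utf-8', index_col='id')

# basic shape of each table and a peek at the language rows
print(langdoc.shape, languages.shape)
print(languages.head())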