Python scripts for downloading Leipzig Corpora languages. The gist holds two files: the downloader script below, and leipzig_meta.json, the metadata (download and service URLs, available sizes, and supported language codes) that the script fetches at start-up.
import os
import tarfile
from html.parser import HTMLParser
from io import BytesIO

import requests


class LeipzigResourceFinder(HTMLParser):
    """Find available Leipzig resources for a given language."""

    def __init__(self, code, variant=None):
        """
        Fetch all resources for a given language. Example:

        .. code::

            lcr = LeipzigResourceFinder(code='fra')
            lcr.data[0]   # 'fra_mixed_2009_10K.tar.gz'
            lcr.data[-1]  # 'fra_wikipedia_2010_1M.tar.gz'
            lcr.find_one(size='10K', typ='news', year='2009')  # 'fra_news_2009_10K.tar.gz'
            lcr.find_one(size='10K', typ='news', year='2014')  # 'fra_news_2009_10K.tar.gz' (closest fallback)

        :param code: the language code, e.g. 'eng' or 'ban'
        :param variant: an optional language variant, e.g. 'ca' for 'eng-ca', 'eu' for 'eng-eu', etc.
        """
        super().__init__()
        # store the arguments
        self.code = code
        self.variant = variant
        # list of all resources, plus an index of resources by type, year and size
        self.data = []
        self.index = {k: {} for k in ['type', 'year', 'size']}
        # fetch the HTML resource table and parse it (see handle_starttag below)
        res = requests.post(
            'http://wortschatz.uni-leipzig.de/download/service',
            data=dict(corpora_language=code, func='table'),
            headers={'X-Requested-With': 'XMLHttpRequest'})
        self.feed(res.text)
    def find_one(self, size='10K', typ='wikipedia', year='2016'):
        """
        Get one resource, giving priority first to size, then resource type, and finally year.
        If no exact match is available, return the closest resource in that priority order.

        :param size: one of 10K, 30K, 100K, 300K, 1M
        :param typ: one of news, news-typical, wikipedia, web, mixed
        :param year: a year, e.g. '2016'
        :return: a resource in the form `<lang/variant>_<type>_<year>_<size>.tar.gz`, or None (rare!)
        """
        if len(self.data) == 0:
            return None
        # first try an exact match, then drop the year constraint, then the type
        sets = [set(self.index[k].get(v, [])) for k, v in zip(['size', 'type', 'year'], (size, typ, year))]
        while len(sets) > 0:
            matches = set.intersection(*sets)
            if len(matches) > 0:
                return matches.pop()
            sets.pop()
        # return anything of the right size (might happen if only variants are available...)
        for i in range(1, len(self.data) + 1):
            if size in self.data[-i]:
                return self.data[-i]
        # last resort: return anything
        return self.data[-1]
    def handle_starttag(self, tag, attrs):
        # HTMLParser override: collect the download links found in the resource table
        if tag == 'a':
            attrs = dict(attrs)
            cls = attrs.get('class', '')
            if 'link_corpora_download' in cls and 'data-corpora-file' in attrs:
                self._process_resource(attrs.get('data-corpora-file'))

    def _process_resource(self, res):
        self.data.append(res)
        split = res.split('.')[0].split('_')
        if len(split) != 4:  # might happen on some rare occasions, such as Serbo-Croatian hbs_ba_web_2015_30K
            return
        lang, typ, year, size = split
        if self.variant is None:
            if lang != self.code:
                return
        elif lang != f'{self.code}-{self.variant}':  # variant resources are named e.g. 'eng-ca_...'
            return
        for k, v in zip(['type', 'year', 'size'], [typ, year, size]):
            if v not in self.index[k]:
                self.index[k][v] = []
            self.index[k][v].append(res)


class Leipzig:
    DOWNLOAD_URL = 'http://pcai056.informatik.uni-leipzig.de/downloads/corpora'
    _gist_meta_url = 'https://gist.githubusercontent.com/derlin/917a64e6412de6c503f3f52e0878f919/raw/leipzig_meta.json'

    def __init__(self, version=None):
        url = self._gist_meta_url
        if version is not None:
            url = url.replace('raw', f'raw/{version}')
        try:
            leipzig_meta = requests.get(url).json()
        except Exception as e:
            raise Exception(f'Could not fetch meta from {url}.') from e
        self.code2lang = leipzig_meta['languages']
        self.lang2code = {v: k for k, v in self.code2lang.items()}

    @staticmethod
    def download_sentences(url):
        """Download sentences from a Leipzig resource URL."""
        # get the tar archive
        res = requests.get(url)
        # extract the *-sentences.txt member from the archive
        tar = tarfile.open(mode='r:gz', fileobj=BytesIO(res.content))
        tar_info = [member for member in tar.getmembers() if member.name.endswith('sentences.txt')][0]
        handle = tar.extractfile(tar_info)
        # each line reads '<number>\t<sentence>': keep only the sentence part
        raw_text = handle.read().decode('utf-8')
        return [line.split('\t')[1] for line in raw_text.split('\n') if '\t' in line]
    def download_all(self, download_folder, language_codes=None, filename='{code}.{size}.txt',
                     size='10K', typ='wikipedia', year='2016',
                     normalize_func=lambda t: t, filter_func=lambda t: True):
        """
        Download all resources into a folder. For the size, typ and year arguments, see LeipzigResourceFinder.

        :param download_folder: the download folder
        :param language_codes: a list of language codes to download; downloads everything if not set
        :param filename: the output filename pattern; available placeholders are code, size and typ
        :param size: the size to prioritize
        :param typ: the type of resource to prioritize, e.g. 'wikipedia', 'web', etc.
        :param year: the year to prioritize
        :param normalize_func: an optional function called on each sentence
        :param filter_func: an optional filter to exclude sentences
        """
        os.makedirs(download_folder, exist_ok=True)
        if language_codes is None:
            language_codes = list(self.code2lang.keys())
        for code in language_codes:
            variant = None
            if '-' in code:
                code, variant = code.split('-')
            outpath = os.path.join(download_folder, filename.format(code=code, typ=typ, size=size))
            if not os.path.exists(outpath):
                try:
                    print(f'Processing {code} {self.code2lang[code]}...', end=' ', flush=True)
                    resource = LeipzigResourceFinder(code, variant).find_one(size, typ, year)
                    print(resource, end=' ', flush=True)
                    lines = [
                        normalize_func(l)
                        for l in self.download_sentences(f'{self.DOWNLOAD_URL}/{resource}')
                        if len(l) > 0 and not l.isspace() and filter_func(l)
                    ]
                    if len(lines):
                        with open(outpath, 'w') as f:
                            f.write('\n'.join(lines))
                        print(f'{len(lines)} lines. OK')
                    else:
                        print('no line.')
                except Exception as e:
                    print('ERROR', e)
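
A minimal usage sketch of the classes above (the module name leipzig and the corpora folder are illustrative assumptions, not part of the gist):

# Assumes the script above is saved as leipzig.py.
# Downloads the French and Balinese 10K wikipedia corpora into ./corpora,
# stripping surrounding whitespace from every sentence.
from leipzig import Leipzig

Leipzig().download_all(
    'corpora',
    language_codes=['fra', 'ban'],
    size='10K', typ='wikipedia', year='2016',
    normalize_func=str.strip)

With the default filename pattern, this writes corpora/fra.10K.txt and corpora/ban.10K.txt; pass version=<gist revision> to Leipzig() to pin the metadata to a specific gist revision.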
leipzig_meta.json:

{
  "download_url": "http://pcai056.informatik.uni-leipzig.de/downloads/corpora",
  "service_url": "http://wortschatz.uni-leipzig.de/download/service",
  "sizes": [
    "10K",
    "30K",
    "100K",
    "300K",
    "1M"
  ],
  "languages": {
    "afr": "Afrikaans",
    "sqi": "Albanian",
    "amh": "Amharic",
    "ara": "Arabic",
    "arg": "Aragonese",
    "hye": "Armenian",
    "asm": "Assamese",
    "ast": "Asturian",
    "aze": "Azerbaijani",
    "ban": "Balinese",
    "bjn": "Banjar",
    "bak": "Bashkir",
    "eus": "Basque",
    "bar": "Bavarian",
    "bel": "Belarusian",
    "ben": "Bengali",
    "bih": "Bihari languages",
    "bik": "Bikol",
    "bpy": "Bishnupriya",
    "bos": "Bosnian",
    "bre": "Breton",
    "bul": "Bulgarian",
    "bua": "Buriat",
    "cat": "Catalan",
    "ceb": "Cebuano",
    "bcl": "Central Bikol",
    "ckb": "Central Kurdish",
    "che": "Chechen",
    "zho": "Chinese",
    "chv": "Chuvash",
    "cos": "Corsican",
    "hrv": "Croatian",
    "ces": "Czech",
    "dan": "Danish",
    "div": "Dhivehi",
    "diq": "Dimli",
    "nld": "Dutch",
    "mhr": "Eastern Mari",
    "arz": "Egyptian Arabic",
    "eml": "Emiliano-Romagnolo",
    "eng": "English",
    "epo": "Esperanto",
    "est": "Estonian",
    "ext": "Extremaduran",
    "fao": "Faroese",
    "hif": "Fiji Hindi",
    "fin": "Finnish",
    "fra": "French",
    "glg": "Galician",
    "lug": "Ganda",
    "kat": "Georgian",
    "deu": "German",
    "glk": "Gilaki",
    "gom": "Goan Konkani",
    "ell": "Greek, Modern",
    "grn": "Guarani",
    "guj": "Gujarati",
    "hat": "Haitian",
    "heb": "Hebrew",
    "hin": "Hindi",
    "hun": "Hungarian",
    "isl": "Icelandic",
    "ido": "Ido",
    "ilo": "Iloko",
    "ind": "Indonesian",
    "ina": "Interlingua",
    "pes": "Iranian Persian",
    "gle": "Irish",
    "ita": "Italian",
    "jpn": "Japanese",
    "jav": "Javanese",
    "kal": "Kalaallisut",
    "kan": "Kannada",
    "krc": "Karachay-Balkar",
    "csb": "Kashubian",
    "kaz": "Kazakh",
    "kin": "Kinyarwanda",
"koi":"Kiowa",
"kir":"Kirghiz",
"kom":"Komi",
"knn":"Konkani",
"kor":"Korean",
"kur":"Kurdish",
"ksh":"Kölsch",
"lat":"Latin",
"lav":"Latvian",
"lim":"Limburgan",
"lit":"Lithuanian",
"lmo":"Lombard",
"nds":"Low German",
"dsb":"Lower Sorbian",
"lus":"Lushai",
"ltz":"Luxembourgish",
"mkd":"Macedonian",
"mad":"Madurese",
"mlg":"Malagasy",
"msa":"Malay",
"mal":"Malayalam",
"mlt":"Maltese",
"cmn":"Mandarin Chinese",
"glv":"Manx",
"mri":"Maori",
"mar":"Marathi",
"mzn":"Mazanderani",
"nan":"Min Nan Chinese",
"min":"Minangkabau",
"xmf":"Mingrelian",
"mwl":"Mirandese",
"mon":"Mongolian",
"nep":"Nepali",
"new":"Newari",
"azj":"North Azerbaijani",
"frr":"Northern Frisian",
"sme":"Northern Sami",
"uzn":"Northern Uzbek",
"nor":"Norwegian",
"nob":"Norwegian Bokmål",
"nno":"Norwegian Nynorsk",
"oci":"Occitan",
"ori":"Oriya",
"oss":"Ossetian",
"pam":"Pampanga",
"pan":"Panjabi",
"pap":"Papiamento",
"nso":"Pedi",
"fas":"Persian",
"pfl":"Pfaelzisch",
"pms":"Piemontese",
"plt":"Plateau Malagasy",
"pol":"Polish",
"por":"Portuguese",
"pus":"Pushto",
"que":"Quechua",
"ron":"Romanian",
"roh":"Romansh",
"rus":"Russian",
"rue":"Rusyn",
"sgs":"Samogitian",
"san":"Sanskrit",
"srd":"Sardinian",
"sco":"Scots",
"srp":"Serbian",
"hbs":"Serbo-Croatian",
"sna":"Shona",
"scn":"Sicilian",
"szl":"Silesian",
"snd":"Sindhi",
"sin":"Sinhala",
"slk":"Slovak",
"slv":"Slovenian",
"som":"Somali",
"sot":"Sotho, Southern",
"spa":"Spanish",
"ekk":"Standard Estonian",
"lvs":"Standard Latvian",
"sun":"Sundanese",
"swh":"Swahili",
"swa":"Swahili",
"swe":"Swedish",
"gsw":"Swiss German",
"tgl":"Tagalog",
"tgk":"Tajik",
"tam":"Tamil",
"tat":"Tatar",
"tel":"Telugu",
"tha":"Thai",
"als":"Tosk Albanian",
"tso":"Tsonga",
"tsn":"Tswana",
"tur":"Turkish",
"tuk":"Turkmen",
"tyv":"Tuvinian",
"udm":"Udmurt",
"uig":"Uighur",
"ukr":"Ukrainian",
"hsb":"Upper Sorbian",
"urd":"Urdu",
"uzb":"Uzbek",
"vec":"Venetian",
"vie":"Vietnamese",
"vls":"Vlaams",
"vol":"Volapük",
"vro":"Võro",
"wln":"Walloon",
"war":"Waray",
"cym":"Welsh",
"fry":"Western Frisian",
"mrj":"Western Mari",
"pnb":"Western Panjabi",
"xho":"Xhosa",
"sah":"Yakut",
"yid":"Yiddish",
"yor":"Yoruba",
"zea":"Zeeuws",
"zul":"Zulu"
}
}