Skip to content

Instantly share code, notes, and snippets.

@hrishikeshrt
Created April 24, 2021 16:52
Show Gist options
  • Save hrishikeshrt/14297fc93b612050915bb93942c64460 to your computer and use it in GitHub Desktop.
Save hrishikeshrt/14297fc93b612050915bb93942c64460 to your computer and use it in GitHub Desktop.
Download stardict dictionaries from indic-dict
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 24 19:25:34 2021
@author: Hrishikesh Terdalkar
Original: https://github.com/sanskrit-coders/pydictupdater
"""
###############################################################################
import os
import tarfile
from urllib.request import urlopen
from urllib.error import URLError, HTTPError
###############################################################################
# Common URL prefix shared by every dictionary index listed below.
INDEX_BASE = "https://raw.githubusercontent.com/indic-dict/"
# File name appended to an index path to form the URL of the index file
# (INDEX_BASE + <index path> + INDEX_SUFFIX).
INDEX_SUFFIX = "tars/tars.MD"
# Maps a language name (as accepted by the -l/--languages option) to the
# index paths, relative to INDEX_BASE, that are processed for that language.
INDEX_LIST = {
"sanskrit": [
"stardict-sanskrit/gh-pages/sa-head/sa-entries/",
"stardict-sanskrit/gh-pages/sa-head/en-entries/",
"stardict-sanskrit/gh-pages/en-head/",
"stardict-sanskrit-vyAkaraNa/gh-pages/",
"stardict-sanskrit-kAvya/gh-pages/",
],
"marathi": [
"stardict-marathi/gh-pages/ma-head/ma-entries/",
"stardict-marathi/gh-pages/ma-head/other-entries/"
],
"bengali": [
"stardict-bengali/gh-pages/bn-head/bn-entries/",
"stardict-bengali/gh-pages/bn-head/en-entries/"
],
"hindi": [
"stardict-hindi/gh-pages/hi-head/hi-entries/",
"stardict-hindi/gh-pages/hi-head/en-entries/",
"stardict-hindi/gh-pages/en-head/",
],
"english": [
"stardict-english/gh-pages/en-head/en-entries/",
"stardict-english/gh-pages/en-head/other-entries/",
"stardict-english/gh-pages/other-head/"
]
}
###############################################################################
def vprint(*args, **kwargs):
    """Silent stand-in for ``print``.

    Accepts any positional and keyword arguments and discards them. The
    script entry point rebinds this name to ``print`` when verbose output
    is requested.
    """
    return None
###############################################################################
# download the url into dir
# if dir does not exist create it
def download_file(url, dir, force_download=True):
    """Download ``url`` into the directory ``dir``.

    The local file name is the last path component of ``url``. ``dir`` is
    created if it does not exist.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    dir : str
        Destination directory.
    force_download : bool
        When False, an already existing local file is kept and no network
        request is made; when True, the file is always fetched again.

    Returns
    -------
    None
        HTTP and URL errors are reported on stdout and are not raised.
    """
    import shutil  # local import: only this function needs it

    os.makedirs(dir, exist_ok=True)
    localpath = os.path.join(dir, os.path.basename(url))

    # Decide whether to skip *before* opening the URL, so that an existing
    # file costs no network round trip.
    if not force_download and os.path.isfile(localpath):
        print(f"Skipped '{localpath}' as it already exists")
        return

    try:
        # The local file is opened only after the request succeeded, so a
        # failed request does not leave an empty file behind. Both handles
        # are closed on exit, and the payload is streamed in chunks rather
        # than read into memory at once.
        with urlopen(url) as response, open(localpath, "wb") as local_file:
            shutil.copyfileobj(response, local_file)
    except HTTPError as e:
        print("HTTP Error:", e.code, url)
    except URLError as e:
        print("URL Error:", e.reason, url)
# take an index_url and return list of .tar.gz listed in it
def get_list_of_download_files(index_url):
    """Fetch the index at ``index_url`` and return the entries it lists.

    The index is read line by line; every non-blank line is decoded as
    UTF-8, stripped of surrounding whitespace and returned as one entry.
    Callers treat each entry as the URL of a dictionary archive.

    Parameters
    ----------
    index_url : str
        Address of the index file.

    Returns
    -------
    list of str
        The entries in the order they appear in the index.

    Raises
    ------
    urllib.error.URLError
        If the index cannot be fetched (``HTTPError`` is a subclass).
    """
    encoding = 'utf-8'
    vprint(f"Processing index '{index_url}' ...")
    returnlist = []
    # The context manager closes the HTTP response once the index is read.
    with urlopen(index_url) as response:
        for line in response:
            dict_url = line.decode(encoding).strip()
            # A blank line would otherwise become an empty "URL" that the
            # caller then tries to download.
            if dict_url:
                returnlist.append(dict_url)
    return returnlist
def download_and_extract_dictionary(dict_url, download_dir, extract_dir,
                                    force_download=False):
    """Download one dictionary archive and unpack it.

    The archive is saved in ``download_dir`` under the last path component
    of ``dict_url`` and extracted into a sub-directory of ``extract_dir``.
    The sub-directory name is the archive name without its ``.tar.gz``
    suffix, cut at the first ``__`` (so a trailing timestamp such as
    ``kRdanta-rUpa-mAlA__2016-02-20_23-22-27`` is dropped).

    Parameters
    ----------
    dict_url : str
        Address of the archive.
    download_dir : str
        Directory in which the archive is stored.
    extract_dir : str
        Directory under which the archive contents are unpacked.
    force_download : bool
        Passed on to ``download_file``.

    Raises
    ------
    tarfile.TarError
        If the downloaded file cannot be read as a tar archive.
    """
    dictfilename = os.path.basename(dict_url)
    archive_path = os.path.join(download_dir, dictfilename)
    vprint(f"Downloading '{dictfilename}', to '{download_dir}' ...")
    download_file(dict_url, download_dir, force_download)

    # download_file reports failures itself and returns normally; without
    # an archive on disk there is nothing to extract.
    if not os.path.isfile(archive_path):
        vprint(f"Skipping extraction of '{dictfilename}': archive not found.")
        return

    # Drop exactly the ".tar.gz" suffix, then any "__<timestamp>" tail.
    suffix = ".tar.gz"
    stem = dictfilename
    if stem.endswith(suffix):
        stem = stem[:-len(suffix)]
    sub_dirname_to_extract = stem.split("__")[0]
    full_path_of_subdir = os.path.join(extract_dir, sub_dirname_to_extract)
    vprint(f"Extracting to '{full_path_of_subdir}' ...")

    with tarfile.open(archive_path, 'r') as t:
        # The archive comes from the network: where the interpreter offers
        # extraction filters, use the "data" filter so members cannot be
        # written outside the destination directory.
        if hasattr(tarfile, "data_filter"):
            t.extractall(full_path_of_subdir, filter="data")
        else:
            t.extractall(full_path_of_subdir)
def download_dictionaries(index_base, index_list, index_suffix, language,
                          tgz_download_dir, dict_extract_dir,
                          maxcount=1, force_download=False):
    """Download and extract the dictionaries of one language.

    For every index path registered for ``language`` the index file at
    ``index_base + index_path + index_suffix`` is fetched, and each archive
    it lists is downloaded and extracted.

    Parameters
    ----------
    index_base : str
        Common URL prefix of all indexes.
    index_list : dict
        Maps a language name to a list of index paths.
    index_suffix : str
        Appended to every index path to form the index file URL.
    language : str
        Key of ``index_list`` to process.
    tgz_download_dir : str
        Directory in which archives are stored.
    dict_extract_dir : str
        Directory under which archives are unpacked.
    maxcount : int
        Stop after this many dictionaries, counted across all indexes of
        the language. Zero or a negative value (e.g. ``-1``) means no limit.
    force_download : bool
        Passed on to the download step.

    Raises
    ------
    KeyError
        If ``language`` is not a key of ``index_list``.
    """
    count = 0
    for index_url in index_list[language]:
        full_index_path = index_base + index_url + index_suffix
        # download this index
        vprint("============================================")
        vprint(f"Downloading index '{full_index_path}'.")
        dictlist = get_list_of_download_files(full_index_path)
        for adict in dictlist:
            download_and_extract_dictionary(adict, tgz_download_dir,
                                            dict_extract_dir, force_download)
            count += 1
            # The limit applies only to a positive maxcount; anything else
            # means "no limit".
            if maxcount > 0 and count >= maxcount:
                return
        vprint("============================================")
def get_master_list_to_download(base, index_list, index_suffix=""):
    """Collect the entries of every index of every language.

    Parameters
    ----------
    base : str
        Common URL prefix of all indexes.
    index_list : dict
        Maps a language name to a list of index paths.
    index_suffix : str
        Appended to ``base + index_path`` to form the index URL. Defaults
        to the empty string, which keeps the original two-argument
        behaviour. NOTE(review): ``download_dictionaries`` appends an index
        file name here; callers of this function probably want to pass the
        same suffix - confirm.

    Returns
    -------
    list of str
        Entries of all indexes, concatenated in iteration order.
    """
    masterlist = []
    for language_urls in index_list.values():
        for index_url in language_urls:
            full_index_path = base + index_url + index_suffix
            # download this index
            vprint("============================================")
            vprint(f"Fetching index '{full_index_path}' ...")
            masterlist.extend(get_list_of_download_files(full_index_path))
            vprint("============================================")
    return masterlist
###############################################################################
if __name__ == '__main__':
    # Command-line entry point: download and install the dictionaries of
    # the requested languages.
    import argparse
    import sys
    import tempfile

    home_dir = os.path.expanduser('~')

    class Config:
        # Default option values. An instance is passed to parse_args() as
        # the namespace; argparse only fills in attributes the namespace
        # does not already have, so these class attributes act as defaults
        # for options missing from the command line.
        download = os.path.join(tempfile.gettempdir(), "dictdata")
        install = os.path.join(home_dir, "dictdata")
        languages = ["sanskrit", "english"]
        verbose = False
        force = False

    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--download", help="Download Location")
    parser.add_argument("-i", "--install", help="Install Location")
    parser.add_argument("-l", "--languages", nargs="+", help="Languages")
    parser.add_argument("-v", "--verbose", action='store_true',
                        help="Verbose")
    parser.add_argument("-f", "--force", action='store_true',
                        help="Force Download")
    args = parser.parse_args(namespace=Config())

    # ----------------------------------------------------------------------- #

    install_dir = args.install
    download_dir = args.download
    languages = args.languages
    force_download = args.force
    verbose = args.verbose

    # ----------------------------------------------------------------------- #

    # Rebind the module-level no-op so that progress messages are printed
    # when -v/--verbose is given.
    vprint = print if verbose else vprint

    for language in languages:
        if language not in INDEX_LIST:
            # An error must be visible without --verbose, so it does not
            # go through vprint.
            print(f"Error: Language '{language}' not found.",
                  file=sys.stderr)
            continue

        vprint("============================================")
        vprint(f"Language: {language.title()}")
        # maxcount=-1: no limit on the number of dictionaries per language.
        download_dictionaries(INDEX_BASE, INDEX_LIST, INDEX_SUFFIX, language,
                              download_dir, install_dir, maxcount=-1,
                              force_download=force_download)
        vprint("============================================")
###############################################################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment