hrishikeshrt/indic-stardict-downloader

## indic-stardict-downloader
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 24 19:25:34 2021

@author: Hrishikesh Terdalkar

Original: https://github.com/sanskrit-coders/pydictupdater
"""

###############################################################################

import os
import tarfile
from urllib.request import urlopen
from urllib.error import URLError, HTTPError

###############################################################################

# Common URL prefix shared by every index location listed in INDEX_LIST.
INDEX_BASE = "https://raw.githubusercontent.com/indic-dict/"
# Path appended after an INDEX_LIST entry to form the URL of its index file.
INDEX_SUFFIX = "tars/tars.MD"

# Maps a language name (as accepted by the --languages option) to the index
# locations of that language, each relative to INDEX_BASE.  The full index
# URL is INDEX_BASE + entry + INDEX_SUFFIX.
INDEX_LIST = {
    "sanskrit": [
        "stardict-sanskrit/gh-pages/sa-head/sa-entries/",
        "stardict-sanskrit/gh-pages/sa-head/en-entries/",
        "stardict-sanskrit/gh-pages/en-head/",
        "stardict-sanskrit-vyAkaraNa/gh-pages/",
        "stardict-sanskrit-kAvya/gh-pages/",
    ],
    "marathi": [
        "stardict-marathi/gh-pages/ma-head/ma-entries/",
        "stardict-marathi/gh-pages/ma-head/other-entries/"
    ],
    "bengali": [
        "stardict-bengali/gh-pages/bn-head/bn-entries/",
        "stardict-bengali/gh-pages/bn-head/en-entries/"
    ],
    "hindi": [
        "stardict-hindi/gh-pages/hi-head/hi-entries/",
        "stardict-hindi/gh-pages/hi-head/en-entries/",
        "stardict-hindi/gh-pages/en-head/",
    ],
    "english": [
        "stardict-english/gh-pages/en-head/en-entries/",
        "stardict-english/gh-pages/en-head/other-entries/",
        "stardict-english/gh-pages/other-head/"
    ]
}

###############################################################################


def vprint(*args, **kwargs):
    """Silently discard a message; the quiet stand-in for ``print``.

    Any positional and keyword arguments are accepted and ignored.
    """
    return None

###############################################################################
# download the url into dir
# if dir does not exist create it


def download_file(url, dir, force_download=True):
    """Download ``url`` into the directory ``dir``.

    The directory is created (including parents) when it does not exist and
    the local file is named after the last path component of ``url``.

    Args:
        url: Address of the file to fetch.
        dir: Directory that receives the downloaded file.
        force_download: When false, a local file that already exists is kept
            and nothing is fetched.

    Returns:
        None.  HTTP and URL errors are printed, not raised.
    """
    os.makedirs(dir, exist_ok=True)
    localpath = os.path.join(dir, os.path.basename(url))

    # Decide about skipping before opening the URL, so an existing file does
    # not cost a network request.
    if not force_download and os.path.isfile(localpath):
        print(f"Skipped '{localpath}' as it already exists")
        return

    try:
        # Both context managers guarantee the connection and the local file
        # are closed, even when the transfer fails half-way.
        with urlopen(url) as response:
            with open(localpath, "wb") as local_file:
                # Copy in chunks instead of holding the whole file in memory.
                for chunk in iter(lambda: response.read(64 * 1024), b""):
                    local_file.write(chunk)
    except HTTPError as e:
        print("HTTP Error:", e.code, url)
    except URLError as e:
        print("URL Error:", e.reason, url)


# take an index_url and return list of .tar.gz listed in it
def get_list_of_download_files(index_url):
    """Fetch an index file and return the entries it lists, one per line.

    Args:
        index_url: URL of the index; every non-blank line is taken to be
            the URL of one downloadable archive.

    Returns:
        List of the lines of the index, decoded as UTF-8 with surrounding
        whitespace removed.  Blank lines are left out.

    Raises:
        urllib.error.URLError: If the index cannot be opened.
    """
    encoding = 'utf-8'

    vprint(f"Processing index '{index_url}' ...")
    # The context manager closes the connection once the index is read.
    with urlopen(index_url) as response:
        entries = [raw.decode(encoding).strip() for raw in response]
    # A blank line (e.g. a trailing newline) is not a URL; passing it on
    # would only produce a failing download later.
    return [entry for entry in entries if entry]


def download_and_extract_dictionary(dict_url, download_dir, extract_dir,
                                    force_download=False):
    """Download one dictionary archive and unpack it.

    The archive is stored in ``download_dir`` and extracted into a
    sub-directory of ``extract_dir`` named after the archive: the
    ``.tar.gz`` suffix is dropped and only the part before the first ``__``
    is kept.

    Args:
        dict_url: URL of the archive.
        download_dir: Directory where the archive is saved.
        extract_dir: Directory under which the archive is unpacked.
        force_download: Passed on to ``download_file``; when false an
            already downloaded archive is reused.

    Returns:
        None.
    """
    dictfilename = os.path.basename(dict_url)
    archive_path = os.path.join(download_dir, dictfilename)

    vprint(f"Downloading '{dictfilename}', to '{download_dir}' ...")
    download_file(dict_url, download_dir, force_download)

    # download_file() prints errors instead of raising them, so the archive
    # may be absent here; skip it rather than abort the whole run.
    if not os.path.isfile(archive_path):
        print(f"Skipped extraction of '{dictfilename}' as it is not available")
        return

    # Handle filenames like: kRdanta-rUpa-mAlA__2016-02-20_23-22-27
    # Strip exactly the ".tar.gz" suffix (7 characters) when it is present.
    suffix = ".tar.gz"
    stem = dictfilename
    if stem.endswith(suffix):
        stem = stem[:-len(suffix)]
    sub_dirname_to_extract = stem.split("__")[0]
    full_path_of_subdir = os.path.join(extract_dir, sub_dirname_to_extract)

    vprint(f"Extracting to '{full_path_of_subdir}' ...")
    with tarfile.open(archive_path, 'r') as t:
        # The archive comes from the network: where the interpreter offers
        # extraction filters, refuse members that would escape the target
        # directory.
        if hasattr(tarfile, "data_filter"):
            t.extractall(full_path_of_subdir, filter="data")
        else:
            # NOTE(review): no extraction filter on this interpreter; the
            # archive content is trusted as-is.
            t.extractall(full_path_of_subdir)


def download_dictionaries(index_base, index_list, index_suffix, language,
                          tgz_download_dir, dict_extract_dir,
                          maxcount=1, force_download=False):
    """Download and extract the dictionaries of one language.

    Args:
        index_base: URL prefix of every index.
        index_list: Mapping of language name to a list of index locations.
        index_suffix: Path appended to a location to address its index file.
        language: Key of ``index_list`` to process.
        tgz_download_dir: Directory where archives are saved.
        dict_extract_dir: Directory under which archives are unpacked.
        maxcount: Stop once this many dictionaries were processed, counted
            across all indexes of the language.  A value the counter can
            never reach (zero or negative, e.g. -1) means no limit.
        force_download: When false, already downloaded archives are reused.

    Returns:
        None.

    Raises:
        KeyError: If ``language`` is not a key of ``index_list``.
    """
    count = 0
    for index_url in index_list[language]:
        full_index_path = index_base + index_url + index_suffix
        # download this index
        vprint("============================================")
        vprint(f"Downloading index '{full_index_path}'.")

        # Report an unreachable index the same way download_file() reports
        # an unreachable archive, and carry on with the next index.
        try:
            dictlist = get_list_of_download_files(full_index_path)
        except HTTPError as e:
            print("HTTP Error:", e.code, full_index_path)
            dictlist = []
        except URLError as e:
            print("URL Error:", e.reason, full_index_path)
            dictlist = []

        for adict in dictlist:
            download_and_extract_dictionary(adict, tgz_download_dir,
                                            dict_extract_dir, force_download)
            count += 1
            # count starts at 1 here, so a non-positive maxcount never
            # matches and the download is unlimited.
            if count == maxcount:
                return
        vprint("============================================")


def get_master_list_to_download(base, index_list, index_suffix=""):
    """Collect the entries of every index of every language.

    Args:
        base: URL prefix of every index.
        index_list: Mapping of language name to a list of index locations.
        index_suffix: Optional path appended to each location to address
            its index file, as ``download_dictionaries`` does.  The default
            (empty) leaves the location unchanged.

    Returns:
        One flat list with the entries of all indexes, in iteration order.
    """
    masterlist = []
    for language_urls in index_list.values():
        for index_url in language_urls:
            full_index_path = base + index_url + index_suffix
            # download this index
            vprint("============================================")
            vprint(f"Fetching index '{full_index_path}' ...")
            masterlist.extend(get_list_of_download_files(full_index_path))
            vprint("============================================")
    return masterlist

###############################################################################


# Command line entry point: download and install the dictionaries of the
# requested languages.
if __name__ == '__main__':
    import argparse
    import sys
    import tempfile

    home_dir = os.path.expanduser('~')

    class Config:
        # Used as the argparse namespace: these class attributes are the
        # values seen for options that are not given on the command line.
        download = os.path.join(tempfile.gettempdir(), "dictdata")
        install = os.path.join(home_dir, "dictdata")
        languages = ["sanskrit", "english"]
        verbose = False
        force = False

    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--download", help="Download Location")
    parser.add_argument("-i", "--install", help="Install Location")
    parser.add_argument("-l", "--languages", nargs="+", help="Languages")
    parser.add_argument("-v", "--verbose", action='store_true',
                        help="Verbose")
    parser.add_argument("-f", "--force", action='store_true',
                        help="Force Download")
    args = parser.parse_args(namespace=Config())

    # ----------------------------------------------------------------------- #

    install_dir = args.install
    download_dir = args.download
    languages = args.languages
    force_download = args.force
    verbose = args.verbose

    # ----------------------------------------------------------------------- #

    # Progress messages are shown only on request; vprint is a no-op otherwise.
    vprint = print if verbose else vprint

    for language in languages:
        if language not in INDEX_LIST:
            # An error must be visible even without --verbose.
            print(f"Error: Language '{language}' not found.", file=sys.stderr)
            continue

        vprint("============================================")
        vprint(f"Language: {language.title()}")
        # maxcount=-1: no limit on the number of dictionaries.
        download_dictionaries(INDEX_BASE, INDEX_LIST, INDEX_SUFFIX, language,
                              download_dir, install_dir, maxcount=-1,
                              force_download=force_download)
        vprint("============================================")

###############################################################################
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	"""
	Created on Sat Apr 24 19:25:34 2021

	@author: Hrishikesh Terdalkar

	Original: https://github.com/sanskrit-coders/pydictupdater
	"""

	###############################################################################

	import os
	import tarfile
	from urllib.request import urlopen
	from urllib.error import URLError, HTTPError

	###############################################################################

	# URL prefix shared by all indexes, and the path of the index file that
	# is appended after each location below.
	INDEX_BASE = "https://raw.githubusercontent.com/indic-dict/"
	INDEX_SUFFIX = "tars/tars.MD"

	# Index locations per language, relative to INDEX_BASE.
	INDEX_LIST = {
	    "sanskrit": [
	        "stardict-sanskrit/gh-pages/sa-head/sa-entries/",
	        "stardict-sanskrit/gh-pages/sa-head/en-entries/",
	        "stardict-sanskrit/gh-pages/en-head/",
	        "stardict-sanskrit-vyAkaraNa/gh-pages/",
	        "stardict-sanskrit-kAvya/gh-pages/",
	    ],
	    "marathi": [
	        "stardict-marathi/gh-pages/ma-head/ma-entries/",
	        "stardict-marathi/gh-pages/ma-head/other-entries/",
	    ],
	    "bengali": [
	        "stardict-bengali/gh-pages/bn-head/bn-entries/",
	        "stardict-bengali/gh-pages/bn-head/en-entries/",
	    ],
	    "hindi": [
	        "stardict-hindi/gh-pages/hi-head/hi-entries/",
	        "stardict-hindi/gh-pages/hi-head/en-entries/",
	        "stardict-hindi/gh-pages/en-head/",
	    ],
	    "english": [
	        "stardict-english/gh-pages/en-head/en-entries/",
	        "stardict-english/gh-pages/en-head/other-entries/",
	        "stardict-english/gh-pages/other-head/",
	    ],
	}

	###############################################################################


	def vprint(*args, **kwargs):
	    """Silently discard a message; the quiet stand-in for ``print``.

	    Any positional and keyword arguments are accepted and ignored, so
	    the function can be called exactly like ``print``.
	    """
	    pass

	###############################################################################
	# download the url into dir
	# if dir does not exist create it


	def download_file(url, dir, force_download=True):
	    """Download ``url`` into the directory ``dir``.

	    The directory is created (including parents) when it does not exist
	    and the local file is named after the last path component of ``url``.

	    Args:
	        url: Address of the file to fetch.
	        dir: Directory that receives the downloaded file.
	        force_download: When false, a local file that already exists is
	            kept and nothing is fetched.

	    Returns:
	        None.  HTTP and URL errors are printed, not raised.
	    """
	    os.makedirs(dir, exist_ok=True)
	    localpath = os.path.join(dir, os.path.basename(url))

	    # Decide about skipping before opening the URL, so an existing file
	    # does not cost a network request.
	    if not force_download and os.path.isfile(localpath):
	        print(f"Skipped '{localpath}' as it already exists")
	        return

	    try:
	        # Both context managers guarantee the connection and the local
	        # file are closed, even when the transfer fails half-way.
	        with urlopen(url) as response:
	            with open(localpath, "wb") as local_file:
	                # Copy in chunks instead of holding the whole file in
	                # memory.
	                for chunk in iter(lambda: response.read(64 * 1024), b""):
	                    local_file.write(chunk)
	    except HTTPError as e:
	        print("HTTP Error:", e.code, url)
	    except URLError as e:
	        print("URL Error:", e.reason, url)


	# take an index_url and return list of .tar.gz listed in it
	def get_list_of_download_files(index_url):
	    """Fetch an index file and return the entries it lists, one per line.

	    Args:
	        index_url: URL of the index; every non-blank line is taken to be
	            the URL of one downloadable archive.

	    Returns:
	        List of the lines of the index, decoded as UTF-8 with surrounding
	        whitespace removed.  Blank lines are left out.

	    Raises:
	        urllib.error.URLError: If the index cannot be opened.
	    """
	    encoding = 'utf-8'

	    vprint(f"Processing index '{index_url}' ...")
	    # The context manager closes the connection once the index is read.
	    with urlopen(index_url) as response:
	        entries = [raw.decode(encoding).strip() for raw in response]
	    # A blank line (e.g. a trailing newline) is not a URL; passing it on
	    # would only produce a failing download later.
	    return [entry for entry in entries if entry]


	def download_and_extract_dictionary(dict_url, download_dir, extract_dir,
	                                    force_download=False):
	    """Download one dictionary archive and unpack it.

	    The archive is stored in ``download_dir`` and extracted into a
	    sub-directory of ``extract_dir`` named after the archive: the
	    ``.tar.gz`` suffix is dropped and only the part before the first
	    ``__`` is kept.

	    Args:
	        dict_url: URL of the archive.
	        download_dir: Directory where the archive is saved.
	        extract_dir: Directory under which the archive is unpacked.
	        force_download: Passed on to ``download_file``; when false an
	            already downloaded archive is reused.

	    Returns:
	        None.
	    """
	    dictfilename = os.path.basename(dict_url)
	    archive_path = os.path.join(download_dir, dictfilename)

	    vprint(f"Downloading '{dictfilename}', to '{download_dir}' ...")
	    download_file(dict_url, download_dir, force_download)

	    # download_file() prints errors instead of raising them, so the
	    # archive may be absent here; skip it rather than abort the whole run.
	    if not os.path.isfile(archive_path):
	        print(f"Skipped extraction of '{dictfilename}' as it is not available")
	        return

	    # Handle filenames like: kRdanta-rUpa-mAlA__2016-02-20_23-22-27
	    # Strip exactly the ".tar.gz" suffix (7 characters) when present.
	    suffix = ".tar.gz"
	    stem = dictfilename
	    if stem.endswith(suffix):
	        stem = stem[:-len(suffix)]
	    sub_dirname_to_extract = stem.split("__")[0]
	    full_path_of_subdir = os.path.join(extract_dir, sub_dirname_to_extract)

	    vprint(f"Extracting to '{full_path_of_subdir}' ...")
	    with tarfile.open(archive_path, 'r') as t:
	        # The archive comes from the network: where the interpreter
	        # offers extraction filters, refuse members that would escape the
	        # target directory.
	        if hasattr(tarfile, "data_filter"):
	            t.extractall(full_path_of_subdir, filter="data")
	        else:
	            # NOTE(review): no extraction filter on this interpreter; the
	            # archive content is trusted as-is.
	            t.extractall(full_path_of_subdir)


	def download_dictionaries(index_base, index_list, index_suffix, language,
	                          tgz_download_dir, dict_extract_dir,
	                          maxcount=1, force_download=False):
	    """Download and extract the dictionaries of one language.

	    Args:
	        index_base: URL prefix of every index.
	        index_list: Mapping of language name to a list of index locations.
	        index_suffix: Path appended to a location to address its index
	            file.
	        language: Key of ``index_list`` to process.
	        tgz_download_dir: Directory where archives are saved.
	        dict_extract_dir: Directory under which archives are unpacked.
	        maxcount: Stop once this many dictionaries were processed, counted
	            across all indexes of the language.  A value the counter can
	            never reach (zero or negative, e.g. -1) means no limit.
	        force_download: When false, already downloaded archives are
	            reused.

	    Returns:
	        None.

	    Raises:
	        KeyError: If ``language`` is not a key of ``index_list``.
	    """
	    count = 0
	    for index_url in index_list[language]:
	        full_index_path = index_base + index_url + index_suffix
	        # download this index
	        vprint("============================================")
	        vprint(f"Downloading index '{full_index_path}'.")

	        # Report an unreachable index the same way download_file() reports
	        # an unreachable archive, and carry on with the next index.
	        try:
	            dictlist = get_list_of_download_files(full_index_path)
	        except HTTPError as e:
	            print("HTTP Error:", e.code, full_index_path)
	            dictlist = []
	        except URLError as e:
	            print("URL Error:", e.reason, full_index_path)
	            dictlist = []

	        for adict in dictlist:
	            download_and_extract_dictionary(adict, tgz_download_dir,
	                                            dict_extract_dir,
	                                            force_download)
	            count += 1
	            # count starts at 1 here, so a non-positive maxcount never
	            # matches and the download is unlimited.
	            if count == maxcount:
	                return
	        vprint("============================================")


	def get_master_list_to_download(base, index_list, index_suffix=""):
	    """Collect the entries of every index of every language.

	    Args:
	        base: URL prefix of every index.
	        index_list: Mapping of language name to a list of index locations.
	        index_suffix: Optional path appended to each location to address
	            its index file, as ``download_dictionaries`` does.  The
	            default (empty) leaves the location unchanged.

	    Returns:
	        One flat list with the entries of all indexes, in iteration order.
	    """
	    masterlist = []
	    for language_urls in index_list.values():
	        for index_url in language_urls:
	            full_index_path = base + index_url + index_suffix
	            # download this index
	            vprint("============================================")
	            vprint(f"Fetching index '{full_index_path}' ...")
	            masterlist.extend(get_list_of_download_files(full_index_path))
	            vprint("============================================")
	    return masterlist

	###############################################################################


	# Command line entry point: download and install the dictionaries of the
	# requested languages.
	if __name__ == '__main__':
	    import argparse
	    import sys
	    import tempfile

	    home_dir = os.path.expanduser('~')

	    class Config:
	        # Used as the argparse namespace: these class attributes are the
	        # values seen for options that are not given on the command line.
	        download = os.path.join(tempfile.gettempdir(), "dictdata")
	        install = os.path.join(home_dir, "dictdata")
	        languages = ["sanskrit", "english"]
	        verbose = False
	        force = False

	    parser = argparse.ArgumentParser()
	    parser.add_argument("-d", "--download", help="Download Location")
	    parser.add_argument("-i", "--install", help="Install Location")
	    parser.add_argument("-l", "--languages", nargs="+", help="Languages")
	    parser.add_argument("-v", "--verbose", action='store_true',
	                        help="Verbose")
	    parser.add_argument("-f", "--force", action='store_true',
	                        help="Force Download")
	    args = parser.parse_args(namespace=Config())

	    # ------------------------------------------------------------------- #

	    install_dir = args.install
	    download_dir = args.download
	    languages = args.languages
	    force_download = args.force
	    verbose = args.verbose

	    # ------------------------------------------------------------------- #

	    # Progress messages are shown only on request; vprint is a no-op
	    # otherwise.
	    vprint = print if verbose else vprint

	    for language in languages:
	        if language not in INDEX_LIST:
	            # An error must be visible even without --verbose.
	            print(f"Error: Language '{language}' not found.",
	                  file=sys.stderr)
	            continue

	        vprint("============================================")
	        vprint(f"Language: {language.title()}")
	        # maxcount=-1: no limit on the number of dictionaries.
	        download_dictionaries(INDEX_BASE, INDEX_LIST, INDEX_SUFFIX,
	                              language, download_dir, install_dir,
	                              maxcount=-1, force_download=force_download)
	        vprint("============================================")

	###############################################################################