Skip to content

Instantly share code, notes, and snippets.

@santhoshtr
Created November 20, 2023 04:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save santhoshtr/20548fa75687b266e098a5f62daea74e to your computer and use it in GitHub Desktop.
Save santhoshtr/20548fa75687b266e098a5f62daea74e to your computer and use it in GitHub Desktop.
Print a tab-separated file with all languages supported by MT providers of WMF Cxserver
# Print a tab-separated file with all languages supported by MT providers
import requests
from typing import List
# Compact one-character label for each machine-translation provider.
# Keys are the provider names looked up by get_mt_coverage_info(); values are
# the circled letters written into the coverage table cells.
mtlabels = {
    "Apertium": "Ⓐ",
    "Elia": "Ⓔ",
    "Google": "Ⓖ",
    "MinT": "Ⓜ",
    "Yandex": "Ⓨ",
    "LingoCloud": "Ⓛ",
}
def get_wiki_sites(project: str = "wiki", timeout: float = 30.0) -> List[str]:
    """Get all language codes for a given Wikimedia project.

    Queries the ``sitematrix`` API module on meta.wikimedia.org and collects
    the language code of every language entry that has a site whose ``code``
    equals *project* and that carries no ``closed`` marker.

    Valid project codes:
    * wiki = Wikipedia
    * wiktionary = Wiktionary
    * wikibooks = Wikibooks
    * wikinews = Wikinews
    * wikiquote = Wikiquote
    * wikisource = Wikisource
    * wikiversity = Wikiversity
    * wikivoyage = Wikivoyage

    Args:
        project: Site code to look for within each language entry.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The matching language codes, de-duplicated and sorted. The ``simple``
        entry is left out because it is English.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            API answers with an HTTP error status.
    """
    base_url = "https://meta.wikimedia.org/w/api.php"
    params = {
        "action": "sitematrix",
        "smlangprop": "|".join(["code", "site"]),
        "smsiteprop": "|".join(["code"]),
        "format": "json",
        "formatversion": "2",
    }
    # The context manager closes the session (and its pooled connections)
    # even when the request raises.
    with requests.Session() as session:
        response = session.get(url=base_url, params=params, timeout=timeout)
        response.raise_for_status()
        result = response.json()

    wiki_languages = set()
    for key, entry in result.get("sitematrix", {}).items():
        # Language entries are keyed by numbers; anything else is the count
        # metadata or the special wikis (Commons, Affiliates, etc.).
        # Only the int() probe sits in the try so that an unrelated
        # ValueError further down is never mistaken for "not a language".
        try:
            int(key)
        except ValueError:
            continue
        for wiki in entry.get("site", []):
            if "closed" in wiki or wiki["code"] != project:
                continue
            code = entry["code"]
            if code == "simple":
                # Simple is English language
                continue
            wiki_languages.add(code)
            break
    return sorted(wiki_languages)
def get_mt_coverage_info(timeout: float = 30.0):
    """Fetch the machine-translation language-pair coverage from cxserver.

    Reads the ``/v1/list/mt`` endpoint, whose JSON maps each provider name to
    a mapping of source language -> iterable of target languages, and inverts
    it into a per-language-pair list of provider labels.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A nested dict ``{source_lang: {target_lang: [label, ...]}}``. Labels
        come from ``mtlabels``; a provider that has no entry there is
        represented by its own name instead of aborting the whole run.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(
        "https://cxserver.wikimedia.org/v1/list/mt", timeout=timeout
    )
    response.raise_for_status()
    data = response.json()

    # Top-level keys of the response that are deliberately left out.
    excluded = {"defaults", "TestClient", "Youdao"}

    coverage = {}
    for provider, language_pairs in data.items():
        if provider in excluded:
            continue
        # Fall back to the provider name so a provider newly added on the
        # server side does not raise KeyError here.
        label = mtlabels.get(provider, provider)
        for source_lang, target_langs in language_pairs.items():
            targets = coverage.setdefault(source_lang, {})
            for target_lang in target_langs:
                targets.setdefault(target_lang, []).append(label)
    return coverage
def print_mt_coverage(show_providers=False):
    """Write the MT coverage matrix to stdout as tab-separated text.

    The first line is an empty corner cell followed by every language code
    returned by get_wiki_sites(). Each following line starts with a source
    language and has one cell per target language. Every cell, including the
    last one on a line, is terminated by a tab.

    Args:
        show_providers: When true, a cell holds the concatenated provider
            labels for that language pair; otherwise it holds a check mark
            if at least one provider covers the pair and is empty if none.
    """
    languages = sorted(set(get_wiki_sites()))
    coverage = get_mt_coverage_info()

    # Header row: empty corner cell, then one tab-terminated cell per language.
    header_cells = [f"{lang}\t" for lang in languages]
    print("\t" + "".join(header_cells))

    for source_lang in languages:
        targets = coverage.get(source_lang, {})
        row = [f"{source_lang}\t"]
        for target_lang in languages:
            providers = targets.get(target_lang, [])
            if show_providers:
                cell = "".join(providers)
            elif providers:
                cell = "✅"
            else:
                cell = ""
            row.append(f"{cell}\t")
        print("".join(row))
# Script entry point: runs only when the file is executed directly, and
# prints the coverage table with provider labels in each cell.
if __name__ == "__main__":
    print_mt_coverage(show_providers=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment