Last active
February 8, 2024 18:06
-
-
Save s03311251/bb54f41186404ddd0d6627f35cef4413 to your computer and use it in GitHub Desktop.
Replace Wikipedia links in a given language with links in another language using the Wikipedia API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import mwparserfromhell | |
import sys | |
# Template-name mapping: source (German) infobox name -> target infobox name.
# Templates that are not listed keep their original name.
translation_title = {
    "Infobox Unternehmen": "infobox company",
    "Infobox Connector": "infobox connector",
}
# Per-template parameter mapping: source template name -> {source parameter
# name -> target parameter name}.  Only templates present here are rewritten,
# and only the parameters listed for them are carried over.
translation_param = {
    "Infobox Unternehmen": {
        "Name": "name",
        "Logo": "logo",
        "Unternehmensform": "type",
        "Gründungsdatum": "founded",
        "Sitz": "hq_location",
        "Leitung": "key_people",
        "Mitarbeiterzahl": "num_employees",
        "Umsatz": "revenue",
        "Stand": "revenue_year",
        "Branche": "industry",
        "Homepage": "website",
    },
    "Infobox Connector": {
        "Name": "connector_name",
    },
}
def translate_template(template):
    """Render a parsed template as wikitext with translated names.

    The template name is looked up in ``translation_title`` (falling back to
    the original name) and each parameter name in the matching
    ``translation_param`` table.  Parameters without a mapping are left out
    of the result.

    Args:
        template: A template node exposing ``name`` and ``params``; each
            parameter exposes ``name`` and ``value`` (the objects yielded by
            ``filter_templates()`` in ``translate_wiki_text``).

    Returns:
        str: The translated template, one parameter per line, with the
        ``=`` signs aligned on the longest translated parameter name.
    """
    original_name = template.name.strip()
    translated_template_name = translation_title.get(original_name, original_name)
    template_mapping = translation_param.get(original_name, {})
    # Loop-invariant: width of the longest translated parameter name.
    # default=0 keeps this safe for a template with an empty mapping.
    max_param_length = max(
        (len(name) for name in template_mapping.values()), default=0)

    lines = [f"{{{{{translated_template_name}"]
    for param in template.params:
        translated_name = template_mapping.get(param.name.strip())
        if translated_name is None:
            continue
        # MediaWiki trims whitespace around named parameter values, so the
        # value is normalised here: without this, single-line templates had
        # all parameters run together and multi-line ones ended with a blank
        # line before "}}".  A leading newline is kept because a value may
        # start with a list ("*", "#") that must stay at the start of a line.
        translated_value = str(param.value).rstrip().lstrip(" \t")
        lines.append(
            f"| {translated_name.ljust(max_param_length)} = {translated_value}")
    lines.append("}}")
    return "\n".join(lines)
def translate_wiki_text(wiki_text):
    """Translate every known infobox template inside a piece of wikitext.

    Templates whose name is a key of ``translation_param`` are replaced by
    the output of ``translate_template``; everything else is left untouched.

    Args:
        wiki_text: The wikitext to process, as a string.

    Returns:
        str: The wikitext with the known templates rewritten.
    """
    wikicode = mwparserfromhell.parse(wiki_text)
    # Walk the templates last-to-first so a nested template is translated
    # before the template that contains it.  Replacing the outer template
    # first would swap in freshly parsed nodes, leaving the inner template
    # object detached from the tree and making its replace() call fail.
    for template in reversed(wikicode.filter_templates()):
        if template.name.strip() not in translation_param:
            continue
        translated_template = translate_template(template)
        wikicode.replace(template, mwparserfromhell.parse(translated_template))
    return str(wikicode)
if __name__ == "__main__":
    # Filter-style entry point: wikitext in on stdin, translated wikitext
    # out on stdout.
    sys.stdout.write(translate_wiki_text(sys.stdin.read()))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
""" | |
Example use: python wiki_link_other_lang.py -from en -to zh-yue -alt zh < input.txt > output.txt | |
Mostly written by ChatGPT, but I debugged part of it and added the -alt option and concurrency afterwards. Here's what I asked ChatGPT to write:
> Please write me a Python script that: | |
> 1. accept 2 arguments "to" and "from"; "to" is the language that Wikipedia articles originally are (I will call it "from_lang" below), and "from" is the target language of the Wikipedia articles (I will call it "to_lang" below); | |
> 2. get input text from stdin, and write the output to stdout; | |
> 3. find all Wikipedia article name in from_lang inside square brackets (in the format [[<article name>]] or [[<article name>|<alt text>]]); | |
> 4. if the article exists in to_lang Wikipedia, replace the text in brackets with the article name in to_lang;
> 5. if the article doesn't exist in to_lang Wikipedia, do the following:
> 5a. if it is formatted as [[<article name>]] in input, replace with {{link-<from_lang>|<article name>}} | |
> 5b. if it is formatted as [[<article name>|<alt text>]] in input, replace with {{link-<from_lang>|<article name>|<article name>|<alt text>}} | |
> 6. print a log that a string has been replaced | |
> there is no main function in the python script | |
> please change the command line format such that it accepts arguments like this: python wikipedia_langlinks.py -in=en -out=ja
> for the code above, please also include a user-agent such that Wikipedia can contact me when there is a problem | |
""" | |
import argparse | |
import sys | |
import re | |
import requests | |
import concurrent.futures | |
# HTTP headers sent with every Wikipedia API request; the User-Agent names
# the script and gives a contact address.
headers = {"User-Agent": "wikipedia_langlinks.py/1.4 (s03311251@hotmail.com)"}
def get_langlinks(article_name, from_lang, to_lang):
    """
    Look up the to_lang counterpart of an article on the from_lang Wikipedia.

    Args:
        article_name: Title of the article on the from_lang Wikipedia, or
            None (treated as "does not exist").
        from_lang: Language code of the Wikipedia that is queried.
        to_lang: Language code whose language link is wanted.

    Returns:
        bool: True if article_name exists in from_lang.
        string: article name in to_lang; None if not exist in to_lang.

        Any error while querying or decoding the response is reported on
        stderr and yields (False, None).
    """
    if article_name is None:
        return False, None

    url = f"https://{from_lang}.wikipedia.org/w/api.php"
    # Hand the query to requests as a dict so the title is percent-encoded;
    # interpolating it into the URL broke titles containing "&", "+", "?"...
    params = {
        "action": "query",
        "titles": article_name,
        "prop": "langlinks",
        "lllang": to_lang,
        "format": "json",
    }
    try:
        # The timeout keeps a stalled connection from blocking a worker
        # thread forever; a timeout is handled like any other error below.
        response = requests.get(url, params=params, headers=headers, timeout=30)
        pages = response.json()["query"]["pages"]
        for key, page in pages.items():
            if key == "-1":
                # page not exist in from_lang
                return False, None
            for langlink in page.get("langlinks", ()):
                if langlink["lang"] == to_lang:
                    return True, langlink["*"]
    except Exception as ex:
        sys.stderr.write(
            f"Error on {article_name}: {type(ex).__name__} {ex.args}\r\n")
        return False, None
    return True, None
def get_redirect(article_name, lang):
    """
    Resolve a redirect on the lang Wikipedia.

    Args:
        article_name: Title to look up.
        lang: Language code of the Wikipedia that is queried.

    Returns:
        string: Title the article redirects to; None if the API reports no
        redirect for it or if the lookup fails (failures are reported on
        stderr).
    """
    url = f"https://{lang}.wikipedia.org/w/api.php"
    # Passed as a dict so requests percent-encodes the title; building the
    # query string by hand broke titles containing "&", "+", "?"...
    params = {
        "action": "query",
        "titles": article_name,
        "redirects": 1,
        "format": "json",
    }
    try:
        # The timeout keeps a stalled connection from hanging the worker.
        json_data = requests.get(
            url, params=params, headers=headers, timeout=30).json()
        # "redirects" is only present when a redirect was followed.
        redirects = json_data["query"].get("redirects")
        if not redirects:
            return None
        return redirects[0]["to"]
    except Exception as ex:
        # Best effort: the caller falls back to the original title, but the
        # failure is no longer silent.
        sys.stderr.write(
            f"Error on redirect lookup for {article_name}: "
            f"{type(ex).__name__} {ex.args}\r\n")
        return None
def replace_links(match, from_lang, to_lang, alt_lang):
    """
    Build the replacement text for one matched wiki link.

    The match must come from the combined pattern built in main(): group 1
    is set for [[article]], groups 2-3 for [[article|alt text]] and groups
    4-6 for [[article#section|alt text]].

    Resolution order:
        1. follow a redirect of the article on the from_lang wiki, if any;
        2. if the article does not exist in from_lang, keep the link as is;
        3. if it has a to_lang counterpart, emit a plain link to it;
        4. else, if alt_lang is given and has a counterpart, emit
           {{link-<alt_lang>|...}};
        5. else emit {{link-<from_lang>|...}} with the original title.

    Every decision is logged on stderr.

    Args:
        match: re.Match object for the link.
        from_lang: Language code of the wiki the links currently point to.
        to_lang: Language code of the target wiki.
        alt_lang: Fallback language code, or None when not requested.

    Returns:
        string: Text to substitute for the matched link.
    """
    if match.group(1):
        # pattern [[article_name]]
        article_name = match.group(1)
        alt_text = ""
        section_name = ""
    elif match.group(2):
        # pattern [[article_name|alt_text]]
        article_name = match.group(2)
        alt_text = match.group(3)
        section_name = ""
    else:
        # pattern [[article_name#section_name|alt_text]]
        article_name = match.group(4)
        alt_text = match.group(6)
        section_name = match.group(5)

    redirected_name = get_redirect(article_name, from_lang)
    redirected_msg = f"{redirected_name} (Redirected) -> " if redirected_name else ""
    # Language links are attached to the redirect target, not the redirect.
    lookup_name = redirected_name or article_name

    exist_in_from_lang, new_article_name = get_langlinks(lookup_name, from_lang, to_lang)

    # page not exist in from_lang
    if not exist_in_from_lang:
        sys.stderr.write(
            f"Not replaced: {match.group()} - {article_name} not exist in {from_lang}\r\n"
        )
        return match.group()

    # exist in to_lang
    if new_article_name is not None:
        if section_name != "":
            new_text = f"[[{new_article_name}#{section_name}|{alt_text}]]"
            sys.stderr.write(f"WARNING: check if section {section_name} exists in {new_article_name} on {to_lang} wiki\r\n")
        elif alt_text != "":
            new_text = f"[[{new_article_name}|{alt_text}]]"
        else:
            new_text = f"[[{new_article_name}]]"
        sys.stderr.write(f"Replaced: {match.group()} -> {redirected_msg}{new_text}\r\n")
        return new_text

    # search for alt_lang too; -alt is optional, so skip the API round trip
    # entirely when no alternative language was given
    if alt_lang:
        _, new_article_name = get_langlinks(lookup_name, from_lang, alt_lang)

        # exist in alt_lang
        if new_article_name is not None:
            if section_name != "":
                new_text = f"{{{{link-{alt_lang}|{new_article_name}|{new_article_name}#{section_name}|{alt_text}}}}}"
                sys.stderr.write(f"WARNING: check if section {section_name} exists in {new_article_name} on {alt_lang} wiki\r\n")
            elif alt_text != "":
                new_text = f"{{{{link-{alt_lang}|{new_article_name}|{new_article_name}|{alt_text}}}}}"
            else:
                new_text = f"{{{{link-{alt_lang}|{new_article_name}}}}}"
            sys.stderr.write(f"Replaced: {match.group()} -> {redirected_msg}{new_text}\r\n")
            return new_text

    # not exist in alt_lang
    if section_name != "":
        new_text = f"{{{{link-{from_lang}|{article_name}|{article_name}#{section_name}|{alt_text}}}}}"
    elif alt_text != "":
        new_text = f"{{{{link-{from_lang}|{article_name}|3={alt_text}}}}}"
    else:
        new_text = f"{{{{link-{from_lang}|{article_name}}}}}"
    sys.stderr.write(f"Replaced: {match.group()} -> {redirected_msg}{new_text}\r\n")
    return new_text
def main(args):
    """
    Rewrite the wiki links of the text on stdin and write it to stdout.

    Each input line is handed to a thread pool, where every link matched by
    the link pattern is passed to replace_links(); the processed lines are
    written to stdout in their original order.

    Args:
        args: Parsed command line options with the attributes from_lang,
            to_lang and alt_lang.
    """
    from_lang = args.from_lang
    to_lang = args.to_lang
    alt_lang = args.alt_lang

    # exclude links with the following prefixes
    exc_prefix = ["File", "Datei", "Category", "Kategorie"]
    # exclude special characters
    # <https://en.wikipedia.org/wiki/Wikipedia:Article_titles#Special_characters>
    exc_char = r"#<>\[\]\|\{\}"

    # Skip a link only when the prefix is followed by the namespace colon;
    # a bare prefix test also skipped ordinary titles such as
    # "Category theory" or "Filesystem".
    not_excluded = r"(?!(?:" + "|".join(exc_prefix) + r")\s*:)"
    text = "([^" + exc_char + "]+)"
    # match patterns "[[xxx]]", "[[xxx|yyy]]" or "[[xxx#zzz|yyy]]"
    # (capture groups 1, 2-3 and 4-6, as expected by replace_links)
    patterns = [
        r"\[\[" + not_excluded + text + r"\]\]",
        r"\[\[" + not_excluded + text + r"\|" + text + r"\]\]",
        r"\[\[" + not_excluded + text + "#" + text + r"\|" + text + r"\]\]",
    ]
    # Built and compiled once instead of once per input line.
    link_pattern = re.compile("|".join(patterns))

    def translate_line(line):
        return link_pattern.sub(
            lambda match: replace_links(match, from_lang, to_lang, alt_lang),
            line)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submit every line, keeping the futures in input order
        futures = [executor.submit(translate_line, line) for line in sys.stdin]

        # Collect the results in submission order so the output lines keep
        # the order of the input lines
        for future in futures:
            sys.stdout.write(future.result())
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Replace Wikipedia links in a given language with links in another language using the Wikipedia API."
    )
    # (flag, destination attribute, required?, help text) for each option.
    cli_options = (
        ("-from", "from_lang", True,
         "The language code of the input text (e.g. en for English)"),
        ("-to", "to_lang", True,
         "The language code of the output text (e.g. es for Spanish)"),
        ("-alt", "alt_lang", False,
         "Alternative language code of the output text if the language specified in \"-to\" is unavailable"),
    )
    for flag, dest, required, help_text in cli_options:
        parser.add_argument(flag, dest=dest, required=required, help=help_text)

    args = parser.parse_args()
    main(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment