Last active
December 14, 2015 23:39
-
-
Save bradbeattie/5166997 to your computer and use it in GitHub Desktop.
Translates a given set of French words into English along with English definitions. Shouldn't be too difficult to adapt to other languages. Makes use of a simple two-column sqlite database to cache the API responses. Ignores duplicate words, obsolete words, etc.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
from bs4 import BeautifulSoup | |
import random | |
import collections | |
import argparse | |
import json | |
import re | |
import requests | |
import sqlite3 | |
import sys | |
import unicodecsv | |
# Wiktionary section headings that contain translatable word classes we keep.
desired_categories = ("Pronoun", "Noun", "Verb", "Preposition", "Adjective", "Article", "Adverb", "Contraction", "Numeral", "Phrase", "Proverb")
# Section headings that carry no definition text and are skipped outright.
skipped_categories = ("Derived terms", "Conjugation", "Pronunciation", "Etymology", "Anagrams", "See also", "Alternative forms", "Initialism", "Verb form", "References", "Letter", "Related terms")
# Module-level sqlite cache of raw API responses, shared by the helpers below.
database = sqlite3.connect("wiktionary.sqlite")
cursor = database.cursor()
def initialise_database():
    """Ensure the two-column response-cache table exists in the sqlite file.

    Uses CREATE TABLE IF NOT EXISTS instead of the original probe-and-catch
    (SELECT COUNT(*) then except sqlite3.OperationalError): a single
    idempotent statement with identical end state.
    """
    cursor.execute("CREATE TABLE IF NOT EXISTS responses(url TEXT, response TEXT, PRIMARY KEY(url DESC))")
def get_french_definition_data(word):
    """Fetch the Wiktionary mobileview sections for *word* and return the
    rows that belong to its "French" language section.

    Returns [] when the API response has no usable sections.  When the page
    has no French section, raises Exception(set_of_french_translations) so
    the caller can retry with the translation candidates found in any
    "Translations" section (empty set if none were found).
    """
    # BUG FIX: the query string contained the mojibake "§ions=all"
    # (HTML-collapsed "&sect..."); the API parameter must be "&sections=all".
    data = cached_get(u"http://en.wiktionary.org/w/api.php?action=mobileview&sections=all&format=json&page=%s" % word)
    try:
        data = json.loads(data)["mobileview"]["sections"]
    except KeyError:
        # Missing page / API error payload: nothing to translate.
        return []
    french = []
    for row in data:
        # toclevel 1 rows are language headings; collect everything between
        # the "French" heading and the next language heading.
        if row.get("toclevel", None) == 1:
            if row["line"] == "French":
                french.append(row)
            elif french:
                break
        elif french:
            french.append(row)
    if not french:
        # No French section: scrape French translation links as candidates.
        possible_translations = set()
        for row in data:
            if row.get("line", None) == "Translations":
                for translation in BeautifulSoup(row["text"]).select("span[lang=fr]"):
                    possible_translations.add(translation.text)
        raise Exception(possible_translations)
    return french
def passes_checks(phrase):
    """Return True unless *phrase* is a cross-reference to another entry
    ("plural of", "participle of", ...) or carries a disqualifying marker
    such as (obsolete), (archaic), (rare) or (dated)."""
    if "-person" in phrase.lower():
        return False
    cross_reference_patterns = (
        "participle of",
        "form of",
        "nonstandard spelling of",
        "common misspelling of",
        "alternative spelling",
        "plural of",
        "clarification of",
    )
    if any(search(pattern, phrase) for pattern in cross_reference_patterns):
        return False
    unwanted_markers = ("(obsolete", "(archaic", "(ne", "(rare", "(dated")
    return not any(marker in phrase for marker in unwanted_markers)
def get_french_definition_triplets(word, valid_categories, number_of_definitions):
    """Return a list of (word_type, french_word, english_definition) triplets
    for *word*, keeping only sections whose heading starts with one of
    *valid_categories* and at most *number_of_definitions* sub-definitions
    per section.

    If *word* has no French section, get_french_definition_data raises an
    Exception carrying a set of possible French translations; those are
    recursed into instead (skipping the word itself to avoid infinite loops).
    """
    triplets = []
    # If the given word isn't french, try its translations
    try:
        french = get_french_definition_data(word)
        print >> sys.stderr, "[WORD]", word
    except Exception as e:
        # A set payload is the "not French, here are candidates" signal;
        # anything else is a genuine error and is re-raised.
        if isinstance(e.args[0], set):
            for possible_word in e.args[0]:
                if possible_word != word:
                    triplets.extend(get_french_definition_triplets(possible_word, valid_categories, number_of_definitions))
        else:
            raise
        return triplets
    # Otherwise, process the french data
    for row in french:
        word_type = row["line"]
        if word_type.startswith(skipped_categories):
            pass
        elif word_type.startswith(valid_categories):
            # Keep only the bare category name, e.g. "Noun 2" -> "Noun".
            word_type = word_type.split(" ")[0]
            soup = BeautifulSoup(row["text"])
            # One entry per <ol><li> sub-definition; lowercase leading
            # "To/A/An/Of" and keep only the first line of each item.
            english_definition = [
                re.sub(r"\b(To|A|An|Of)\b", lambda x: x.groups()[0].lower(), subdefinition.text.split("\n")[0])
                for subdefinition in soup.select("ol > li")
            ]
            english_definition = "; ".join([
                subdefinition
                for subdefinition in english_definition
                if passes_checks(subdefinition)
            ][0:number_of_definitions])
            if english_definition:
                # Append "(noun, extra)" where extra is the text right after
                # the headword, e.g. gender; a bare "(" means no useful extra.
                english_extras = [word_type.lower()]
                try:
                    english_extras.append(soup.select("p strong, p b")[0].find_next_sibling().text)
                    if english_extras[-1] == "(":
                        english_extras = english_extras[0:1]
                except:
                    pass
                english_definition += " (%s)" % ", ".join(english_extras)
                if passes_checks(english_definition):
                    # The headword is the bold text; fall back to the first
                    # paragraph when no bold element exists.
                    try:
                        french_word = soup.select("p strong, p b")[0].text
                    except:
                        french_word = soup.select("p")[0].text
                    triplets.append((word_type, french_word, english_definition))
    return triplets
def search(pattern, string):
    """Case-insensitively test whether *pattern* occurs in *string* bounded
    by word boundaries on both sides."""
    match = re.search(r"\b%s\b" % pattern, string, re.IGNORECASE)
    return match is not None
def cached_get(url):
    """HTTP GET *url*, memoised in the sqlite "responses" table so repeated
    runs never re-hit the Wiktionary API."""
    cached = list(cursor.execute(u"SELECT response FROM responses WHERE url = ?", (url, )))
    if cached:
        return cached[0][0]
    # Cache miss: fetch over the network and persist the body for next time.
    response = requests.get(url).text
    cursor.execute(u"INSERT INTO responses (url, response) VALUES (?, ?)", (url, response))
    database.commit()
    return response
def get_1750_most_common_words():
    """Scrape the Wiktionary appendix of the 1750 most frequently used
    French words and return their page titles as a set."""
    words = set()
    page = BeautifulSoup(cached_get("http://en.wiktionary.org/wiki/Appendix:List_of_the_1750_most_frequently_used_French_words"))
    content = page.find(id="mw-content-text")
    for heading in content.find_all("h2") + content.find_all("h3"):
        node = heading
        # Walk the siblings following this heading until the next heading
        # (or the end of the document), harvesting every /wiki/ link title.
        while getattr(node.find_next_sibling(), "name", None) not in ("h2", "h3", None):
            node = node.find_next_sibling()
            for anchor in node.find_all("a"):
                if anchor.attrs["href"].startswith("/wiki/"):
                    words.add(anchor.attrs["title"])
    return words
def get_definitions(words, valid_categories, number_of_definitions):
    """Build a {(word_type, french_word): english_definition} mapping for
    *words*, skipping entries whose key or definition text was already seen.

    The word list is shuffled so repeated partial runs cover different words.
    """
    shuffled = list(words)
    random.shuffle(shuffled)
    definitions = {}
    for word in shuffled:
        for word_type, french_word, english_definition in get_french_definition_triplets(word, valid_categories, number_of_definitions):
            key = (word_type, french_word)
            # Keep first occurrence only, by key AND by definition text.
            if key in definitions.keys() or english_definition in definitions.values():
                continue
            definitions[key] = english_definition
    return definitions
def write_csv(rows, skipfiles):
    """Write *rows* ({(word_type, french_word): definition}) to stdout as
    backtick-quoted, tab-separated CSV sorted by French word, omitting any
    (french_word, definition) line already present in one of *skipfiles*."""
    csv_options = {
        "encoding": "utf-8",
        "delimiter": "\t",
        "quotechar": "`",
    }
    # Collect previously emitted lines so reruns can exclude them.
    already_written = set()
    for skipfile in skipfiles:
        for row in unicodecsv.reader(open(skipfile), **csv_options):
            already_written.add(tuple(row))
    writer = unicodecsv.writer(sys.stdout, **csv_options)
    for word_type, french_word in sorted(rows.keys(), key=lambda pair: pair[1]):
        line = (french_word, rows[(word_type, french_word)])
        if line not in already_written:
            writer.writerow(line)
def flatten(l):
    """Recursively yield the leaf elements of an arbitrarily nested
    iterable, treating strings as atoms rather than character sequences."""
    for element in l:
        nested = isinstance(element, collections.Iterable) and not isinstance(element, basestring)
        if not nested:
            yield element
        else:
            for leaf in flatten(element):
                yield leaf
def parse_words(words=(), files=()):
    """Collect candidate words from the command line (*words*, UTF-8 byte
    strings) and from the first tab-separated column of each file in
    *files*, returning a set of non-empty unicode strings.

    Defaults are immutable tuples rather than the original mutable [] —
    shared mutable defaults are a classic Python pitfall.
    """
    collected = set(
        word.decode("utf-8").strip()
        for word in words
        if word.strip()
    )
    for filename in files:
        # Iterate the handle directly (readlines() loaded the whole file)
        # and close it deterministically instead of leaking the handle.
        with open(filename) as handle:
            for line in handle:
                collected.add(line.decode("utf-8").split("\t")[0].strip())
    # Drop any empty strings that slipped through from the file branch.
    return set(word for word in collected if word)
if __name__ == "__main__":
    # CLI entry point: translate the requested words (or, when no input is
    # supplied at all, the 1750 most common French words) and print a TSV.
    parser = argparse.ArgumentParser(description="Translate a given set of words")
    parser.add_argument('--word', nargs='+', default=[])
    parser.add_argument('--file', nargs='+', default=[])
    parser.add_argument('--skipfile', nargs='+', default=[])
    parser.add_argument('--category', nargs='+', default=desired_categories)
    parser.add_argument('--definitions', default=2)
    initialise_database()
    args = parser.parse_args()
    # Explicit words/files win; otherwise fall back to the frequency list.
    words = parse_words(args.word, args.file) or get_1750_most_common_words()
    definitions = get_definitions(words, tuple(args.category), int(args.definitions))
    write_csv(definitions, args.skipfile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I've used this script to produce such lists as http://quizlet.com/21061262/wiktionary-professions-flash-cards/