Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Translates a given set of French words into English along with English definitions. Shouldn't be too difficult to adapt to other languages. Makes use of a simple two-column sqlite database to cache the API responses. Ignores duplicate words, obsolete words, etc.
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import random
import collections
import argparse
import json
import re
import requests
import sqlite3
import sys
import unicodecsv
# Section headings accepted by default; this tuple is the default value of
# the --category command-line option.
desired_categories = (
    "Pronoun",
    "Noun",
    "Verb",
    "Preposition",
    "Adjective",
    "Article",
    "Adverb",
    "Contraction",
    "Numeral",
    "Phrase",
    "Proverb",
)

# Section headings tested with str.startswith() against each row's heading
# in get_french_definition_triplets().
skipped_categories = (
    "Derived terms",
    "Conjugation",
    "Pronunciation",
    "Etymology",
    "Anagrams",
    "See also",
    "Alternative forms",
    "Initialism",
    "Verb form",
    "References",
    "Letter",
    "Related terms",
)

# Module-wide SQLite connection and cursor backing the `responses` table
# that cached_get() reads from and writes to.
database = sqlite3.connect("wiktionary.sqlite")
cursor = database.cursor()
def initialise_database():
    """Create the ``responses`` cache table when it is not there yet.

    The table is probed with a ``SELECT``; if SQLite answers with an
    ``OperationalError`` (which is what it raises for a missing table) the
    table is created with ``url`` as its primary key.
    """
    try:
        cursor.execute("SELECT COUNT(*) FROM responses")
    except sqlite3.OperationalError:
        cursor.execute("CREATE TABLE responses(url TEXT, response TEXT, PRIMARY KEY(url DESC))")
# Return the section rows that make up the French entry for `word`.
#
# What the visible statements do:
#   * fetch a page through cached_get() and read the list stored under
#     ["mobileview"]["sections"] of the JSON reply; a KeyError returns [];
#   * walk the rows, using `toclevel == 1` and the heading "French" to decide
#     which rows belong in `french`;
#   * when `french` stays empty, look at rows headed "Translations" for
#     `span[lang=fr]` elements and raise Exception(possible_translations);
#     get_french_definition_triplets() inspects that set through e.args[0].
#
# NOTE(review): this listing has lost its indentation and several lines: the
# `try:` matching `except KeyError:`, the bodies of the `if`/`elif` branches
# and of the innermost `for`, and the URL template (now an empty string, so
# `u"" % word` cannot format). Restore them from the original script before
# running -- TODO confirm against the upstream source.
def get_french_definition_data(word):
data = cached_get(u"" % word)
data = json.loads(data)["mobileview"]["sections"]
except KeyError:
return []
french = []
for row in data:
if row.get("toclevel", None) == 1:
if row["line"] == "French":
elif french:
elif french:
if not french:
possible_translations = set()
for row in data:
if row.get("line", None) == "Translations":
for translation in BeautifulSoup(row["text"]).select("span[lang=fr]"):
raise Exception(possible_translations)
return french
def passes_checks(phrase):
    """Return True when `phrase` contains none of the rejected markers.

    A phrase is rejected when any of the following holds:

    * it contains "-person" (compared case-insensitively);
    * search() finds one of the inflection/spelling phrases below;
    * it contains one of the parenthesised usage labels below
      (compared case-sensitively, as plain substrings).
    """
    rejected_phrases = (
        "participle of",
        "form of",
        "nonstandard spelling of",
        "common misspelling of",
        "alternative spelling",
        "plural of",
        "clarification of",
    )
    rejected_labels = ("(obsolete", "(archaic", "(ne", "(rare", "(dated")

    if "-person" in phrase.lower():
        return False
    if any(search(rejected, phrase) for rejected in rejected_phrases):
        return False
    return not any(label in phrase for label in rejected_labels)
# Build a list of (word_type, french_word, english_definition) tuples for
# `word`.
#
# What the visible statements do:
#   * call get_french_definition_data(word) and write "[WORD]" plus the word
#     to stderr (Python 2 `print >>` syntax);
#   * if an exception whose first argument is a set is caught, recurse into
#     every member of that set that differs from `word` and return the
#     accumulated triplets;
#   * otherwise, for each row, test the heading against `skipped_categories`
#     and `valid_categories` with startswith(), keep the first word of the
#     heading as the word type, parse row["text"] with BeautifulSoup, lower-case
#     the standalone words To/A/An/Of in the first line of each sub-definition,
#     join the sub-definitions accepted by passes_checks() with "; ", append
#     " (<extras>)" and, if the result still passes passes_checks(), record
#     the triplet.
#
# NOTE(review): the listing has lost its indentation and several lines or
# call prefixes: the `try:` statements, the body of the `skipped_categories`
# branch, the closing brackets of both list expressions (`number_of_definitions`
# is never used in what remains), and the callee in front of `"ol > li")`,
# `"p strong, p b")` and `"p")` -- presumably `soup.select(`; verify against
# the original script.
def get_french_definition_triplets(word, valid_categories, number_of_definitions):
triplets = []
# If the given word isn't French, try its translations
french = get_french_definition_data(word)
print >> sys.stderr, "[WORD]", word
except Exception as e:
if isinstance(e.args[0], set):
for possible_word in e.args[0]:
if possible_word != word:
triplets.extend(get_french_definition_triplets(possible_word, valid_categories, number_of_definitions))
return triplets
# Otherwise, process the french data
for row in french:
word_type = row["line"]
if word_type.startswith(skipped_categories):
elif word_type.startswith(valid_categories):
word_type = word_type.split(" ")[0]
soup = BeautifulSoup(row["text"])
english_definition = [
re.sub(r"\b(To|A|An|Of)\b", lambda x: x.groups()[0].lower(), subdefinition.text.split("\n")[0])
for subdefinition in"ol > li")
english_definition = "; ".join([
for subdefinition in english_definition
if passes_checks(subdefinition)
if english_definition:
english_extras = [word_type.lower()]
english_extras.append("p strong, p b")[0].find_next_sibling().text)
if english_extras[-1] == "(":
english_extras = english_extras[0:1]
english_definition += " (%s)" % ", ".join(english_extras)
if passes_checks(english_definition):
french_word ="p strong, p b")[0].text
french_word ="p")[0].text
triplets.append((word_type, french_word, english_definition))
return triplets
def search(pattern, string):
    r"""Return True when `pattern` occurs in `string` as whole words.

    The pattern is wrapped in ``\b`` word-boundary anchors and matched
    case-insensitively anywhere in `string`. `pattern` is inserted into the
    regular expression verbatim, so any metacharacters in it keep their
    special meaning.
    """
    # The template must be a raw string: in an ordinary literal "\b" is the
    # backspace character, so the anchors would never match normal text.
    return bool(re.search(r"\b%s\b" % pattern, string, re.IGNORECASE))
def cached_get(url):
    """Return the response text for `url`, using the ``responses`` table as a cache.

    On a cache hit the stored text is returned without any network access.
    On a miss the URL is fetched with ``requests.get``, the text is inserted
    into the table and the insert is committed so that it is still there on
    the next run.
    """
    row = cursor.execute(u"SELECT response FROM responses WHERE url = ?", (url, )).fetchone()
    if row is not None:
        return row[0]

    response = requests.get(url).text
    cursor.execute(u"INSERT INTO responses (url, response) VALUES (?, ?)", (url, response))
    # sqlite3 opens a transaction implicitly before an INSERT; without a
    # commit the new row is discarded when the connection goes away.
    database.commit()
    return response
# Collect a set of titles from a word-list page fetched through cached_get().
#
# For every <h2> and <h3> inside the element with id "mw-content-text", the
# code steps through the following siblings until the next sibling is another
# h2/h3 (or there is none) and looks at the anchors whose href starts with
# "/wiki/". The set `titles` is returned.
#
# NOTE(review): the listing has lost its indentation, the URL passed to
# cached_get() is an empty string, and the statement guarded by the final
# `if` is missing -- presumably it added something from the anchor to
# `titles`; verify against the original script.
def get_1750_most_common_words():
titles = set()
soup = BeautifulSoup(cached_get(""))
for section in soup.find(id="mw-content-text").find_all("h2") + soup.find(id="mw-content-text").find_all("h3"):
while getattr(section.find_next_sibling(), "name", None) not in ("h2", "h3", None):
section = section.find_next_sibling()
for anchor in section.find_all("a"):
if anchor.attrs["href"].startswith("/wiki/"):
return titles
def get_definitions(words, valid_categories, number_of_definitions):
    """Map ``(word_type, french_word)`` to an English definition for `words`.

    Each word is expanded with get_french_definition_triplets(). A triplet is
    kept only if neither its ``(word_type, french_word)`` key nor its English
    definition has been stored already, so the first occurrence wins.

    :param words: iterable of words to look up.
    :param valid_categories: passed through to get_french_definition_triplets().
    :param number_of_definitions: passed through to get_french_definition_triplets().
    :returns: dict keyed by ``(word_type, french_word)``.
    """
    definitions = {}
    # Mirror of definitions.values() kept as a set so the duplicate-definition
    # test is a hash lookup instead of a scan over every stored value.
    seen_definitions = set()
    for word in words:
        triplets = get_french_definition_triplets(word, valid_categories, number_of_definitions)
        for word_type, french_word, english_definition in triplets:
            key = (word_type, french_word)
            if key in definitions or english_definition in seen_definitions:
                continue
            definitions[key] = english_definition
            seen_definitions.add(english_definition)
    return definitions
# Write `rows` to standard output through a unicodecsv writer.
#
# The reader and the writer share the same options: UTF-8 encoding, a tab as
# delimiter and a backtick as quote character. Every file named in `skipfiles`
# is read with those options. The keys of `rows` are sorted by their second
# element, each output line is the pair (key[1], rows[key]), and a line is
# only considered for output when it is not in `skiplines`.
#
# NOTE(review): the listing has lost its indentation, the closing brace of
# `kwargs`, the body of the inner `for` (presumably adding each line read to
# `skiplines`) and the body of the final `if` (presumably writing the line with
# `writer`); verify against the original script. The files opened here are
# never explicitly closed.
def write_csv(rows, skipfiles):
kwargs = {
"encoding": "utf-8",
"delimiter": "\t",
"quotechar": "`",
skiplines = set()
for skipfile in skipfiles:
for line in unicodecsv.reader(open(skipfile), **kwargs):
writer = unicodecsv.writer(sys.stdout, **kwargs)
for key in sorted(rows.keys(), key=lambda x: x[1]):
line = (key[1], rows[key])
if line not in skiplines:
try:
    # The ABCs live in collections.abc on Python 3; the alias in the
    # top-level collections module was removed in Python 3.10.
    from collections.abc import Iterable as _Iterable
except ImportError:
    # Python 2 only has the top-level name.
    from collections import Iterable as _Iterable

try:
    _STRING_TYPES = (basestring,)  # Python 2: common base of str and unicode
except NameError:
    _STRING_TYPES = (str, bytes)  # Python 3 counterpart


def flatten(l):
    """Yield the leaf elements of an arbitrarily nested iterable, in order.

    Any iterable element is descended into recursively, except strings,
    which are yielded whole rather than split into characters.
    Non-iterable elements are yielded as they are.
    """
    for el in l:
        if isinstance(el, _Iterable) and not isinstance(el, _STRING_TYPES):
            # `yield from` is avoided because the script targets Python 2.
            for sub in flatten(el):
                yield sub
        else:
            yield el
# Build the set of words to translate from the `words` list and from the
# lines of each file named in `files`.
#
# Entries of `words` that are empty after strip() are left out, every file is
# read line by line, and falsy items are filtered out of the final set that
# is returned.
#
# NOTE(review): the listing has lost its indentation, the element expression
# and closing parenthesis of the `set(` generator, and the body of the inner
# `for` (presumably adding each line to `words`); verify against the original
# script. Both default arguments are mutable lists shared between calls, and
# the files opened here are never explicitly closed.
def parse_words(words=[], files=[]):
words = set(
for word in words
if word.strip()
for filename in files:
for line in open(filename).readlines():
return set(filter(lambda x: x, words))
if __name__ == "__main__":
    # Command-line entry point: gather the words, look them up and print the
    # resulting rows to standard output.
    parser = argparse.ArgumentParser(description="Translate a given set of words")
    parser.add_argument('--word', nargs='+', default=[],
                        help="words to translate")
    parser.add_argument('--file', nargs='+', default=[],
                        help="files to read words from")
    parser.add_argument('--skipfile', nargs='+', default=[],
                        help="files whose lines are left out of the output")
    parser.add_argument('--category', nargs='+', default=desired_categories,
                        help="section headings to accept")
    # type=int makes argparse reject a non-numeric value with a usage error
    # instead of a traceback from a later int() call.
    parser.add_argument('--definitions', default=2, type=int,
                        help="passed on as number_of_definitions (default: 2)")
    args = parser.parse_args()

    # cached_get() queries the `responses` table, so it has to exist before
    # the first lookup; the call does nothing when the table is already there.
    initialise_database()

    words = parse_words(args.word, args.file)
    if not words:
        # Nothing given on the command line: fall back to the built-in list.
        words = get_1750_most_common_words()
    write_csv(get_definitions(words, tuple(args.category), args.definitions), args.skipfile)

This comment has been minimized.

Copy link
Owner Author

commented Mar 18, 2013

I've used this script to produce such lists as

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.