Skip to content

Instantly share code, notes, and snippets.

@bradbeattie
Last active December 14, 2015 23:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bradbeattie/5166997 to your computer and use it in GitHub Desktop.
Save bradbeattie/5166997 to your computer and use it in GitHub Desktop.
Translates a given set of French words into English along with English definitions. Shouldn't be too difficult to adapt to other languages. Makes use of a simple two-column sqlite database to cache the API responses. Ignores duplicate words, obsolete words, etc.
#!/usr/bin/python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import random
import collections
import argparse
import json
import re
import requests
import sqlite3
import sys
import unicodecsv
# Wiktionary section headings whose entries we extract definitions from.
desired_categories = ("Pronoun", "Noun", "Verb", "Preposition", "Adjective", "Article", "Adverb", "Contraction", "Numeral", "Phrase", "Proverb")
# Section headings skipped outright (metadata, inflections, cross-references).
skipped_categories = ("Derived terms", "Conjugation", "Pronunciation", "Etymology", "Anagrams", "See also", "Alternative forms", "Initialism", "Verb form", "References", "Letter", "Related terms")
# Two-column url -> response cache of raw API/page bodies (see cached_get).
database = sqlite3.connect("wiktionary.sqlite")
cursor = database.cursor()
def initialise_database():
    """Create the url -> response cache table if it does not already exist.

    Uses SQLite's CREATE TABLE IF NOT EXISTS instead of the original
    probe-with-SELECT-and-catch-OperationalError dance; the end state is
    identical and no exception handling is needed.
    """
    cursor.execute("CREATE TABLE IF NOT EXISTS responses(url TEXT, response TEXT, PRIMARY KEY(url DESC))")
def get_french_definition_data(word):
    """Fetch the Wiktionary sections for *word*, keeping only the French ones.

    Returns the list of section dicts beginning at the top-level "French"
    heading and ending before the next top-level heading.  Returns [] when
    the API response carries no sections at all.  When the page exists but
    has no French section, raises Exception whose single argument is a set
    of candidate French translations scraped from any "Translations"
    section.
    """
    raw = cached_get(u"http://en.wiktionary.org/w/api.php?action=mobileview&sections=all&format=json&page=%s" % word)
    try:
        sections = json.loads(raw)["mobileview"]["sections"]
    except KeyError:
        return []
    french_sections = []
    for section in sections:
        at_top_level = section.get("toclevel", None) == 1
        if at_top_level and section["line"] == "French":
            french_sections.append(section)
        elif at_top_level and french_sections:
            # Reached the next language heading: the French block is done.
            break
        elif french_sections:
            french_sections.append(section)
    if french_sections:
        return french_sections
    # No French section: harvest French translations and signal them to the
    # caller via the exception payload.
    candidates = set()
    for section in sections:
        if section.get("line", None) == "Translations":
            for span in BeautifulSoup(section["text"]).select("span[lang=fr]"):
                candidates.add(span.text)
    raise Exception(candidates)
def passes_checks(phrase):
    """Return True unless *phrase* looks like a cross-reference entry or is
    tagged obsolete/archaic/rare/dated (or the odd "(ne" marker)."""
    if "-person" in phrase.lower():
        return False
    cross_reference_patterns = (
        "participle of",
        "form of",
        "nonstandard spelling of",
        "common misspelling of",
        "alternative spelling",
        "plural of",
        "clarification of",
    )
    if any(search(pattern, phrase) for pattern in cross_reference_patterns):
        return False
    unwanted_markers = ("(obsolete", "(archaic", "(ne", "(rare", "(dated")
    return not any(marker in phrase for marker in unwanted_markers)
def get_french_definition_triplets(word, valid_categories, number_of_definitions):
    """Return a list of (word_type, french_word, english_definition)
    triplets for *word*, keeping at most *number_of_definitions*
    sub-definitions per entry, each filtered through passes_checks()."""
    triplets = []
    # If the given word isn't french, try its translations
    try:
        french = get_french_definition_data(word)
        print >> sys.stderr, "[WORD]", word
    except Exception as e:
        # get_french_definition_data signals "no French section" by raising
        # an Exception whose first arg is a set of candidate French
        # translations; recurse into each of those instead of *word*.
        if isinstance(e.args[0], set):
            for possible_word in e.args[0]:
                if possible_word != word:
                    triplets.extend(get_french_definition_triplets(possible_word, valid_categories, number_of_definitions))
        else:
            raise
        return triplets
    # Otherwise, process the french data
    for row in french:
        word_type = row["line"]
        if word_type.startswith(skipped_categories):
            pass
        elif word_type.startswith(valid_categories):
            # Keep only the first word of the heading, e.g. "Noun 2" -> "Noun".
            word_type = word_type.split(" ")[0]
            soup = BeautifulSoup(row["text"])
            # First line of each <ol><li> sub-definition, with capitalised
            # leading function words (To/A/An/Of) lowercased.
            english_definition = [
                re.sub(r"\b(To|A|An|Of)\b", lambda x: x.groups()[0].lower(), subdefinition.text.split("\n")[0])
                for subdefinition in soup.select("ol > li")
            ]
            english_definition = "; ".join([
                subdefinition
                for subdefinition in english_definition
                if passes_checks(subdefinition)
            ][0:number_of_definitions])
            if english_definition:
                # Append the text that follows the bolded headword —
                # presumably gender/number markers; dropped again if it is
                # just an opening parenthesis. TODO(review): confirm against
                # live Wiktionary markup.
                english_extras = [word_type.lower()]
                try:
                    english_extras.append(soup.select("p strong, p b")[0].find_next_sibling().text)
                    if english_extras[-1] == "(":
                        english_extras = english_extras[0:1]
                except:
                    pass
                english_definition += " (%s)" % ", ".join(english_extras)
                if passes_checks(english_definition):
                    # The headword itself is the bolded term; fall back to
                    # the first paragraph's text when no bold is present.
                    try:
                        french_word = soup.select("p strong, p b")[0].text
                    except:
                        french_word = soup.select("p")[0].text
                    triplets.append((word_type, french_word, english_definition))
    return triplets
def search(pattern, string):
    """Case-insensitively test whether *pattern* occurs in *string* as a
    whole word (wrapped in \\b word boundaries)."""
    match = re.search(r"\b%s\b" % pattern, string, re.IGNORECASE)
    return match is not None
def cached_get(url):
    """GET *url*, consulting and populating the sqlite response cache.

    Cache hits never touch the network; misses are fetched with requests
    and committed to the cache before the body is returned.
    """
    cached_rows = cursor.execute(u"SELECT response FROM responses WHERE url = ?", (url, )).fetchall()
    try:
        return cached_rows[0][0]
    except IndexError:
        body = requests.get(url).text
        cursor.execute(u"INSERT INTO responses (url, response) VALUES (?, ?)", (url, body))
        database.commit()
        return body
def get_1750_most_common_words():
    """Scrape the Wiktionary appendix of the 1750 most frequently used
    French words and return the set of linked page titles."""
    titles = set()
    soup = BeautifulSoup(cached_get("http://en.wiktionary.org/wiki/Appendix:List_of_the_1750_most_frequently_used_French_words"))
    # Each h2/h3 heading starts a section; walk forward through its sibling
    # elements until the next heading (or end of document), collecting the
    # title of every in-wiki link found along the way.
    for section in soup.find(id="mw-content-text").find_all("h2") + soup.find(id="mw-content-text").find_all("h3"):
        while getattr(section.find_next_sibling(), "name", None) not in ("h2", "h3", None):
            section = section.find_next_sibling()
            for anchor in section.find_all("a"):
                # Only internal wiki links; skips external/edit links.
                if anchor.attrs["href"].startswith("/wiki/"):
                    titles.add(anchor.attrs["title"])
    return titles
def get_definitions(words, valid_categories, number_of_definitions):
    """Build a {(word_type, french_word): english_definition} mapping.

    Words are shuffled so interrupted runs warm the response cache evenly.
    Each (type, word) key and each definition string is kept at most once.
    """
    definitions = {}
    # Parallel set of definition strings already used, replacing the
    # original per-triplet O(n) scans of .keys() and .values().
    seen_definitions = set()
    shuffled_words = list(words)
    random.shuffle(shuffled_words)
    for word in shuffled_words:
        for word_type, french_word, english_definition in get_french_definition_triplets(word, valid_categories, number_of_definitions):
            key = (word_type, french_word)
            if key not in definitions and english_definition not in seen_definitions:
                definitions[key] = english_definition
                seen_definitions.add(english_definition)
    return definitions
def write_csv(rows, skipfiles):
    """Emit (french_word, definition) pairs to stdout as tab-separated CSV,
    sorted by french word, omitting any line already present in one of
    *skipfiles*."""
    csv_options = {
        "encoding": "utf-8",
        "delimiter": "\t",
        "quotechar": "`",
    }
    already_written = set()
    for skipfile in skipfiles:
        for row in unicodecsv.reader(open(skipfile), **csv_options):
            already_written.add(tuple(row))
    writer = unicodecsv.writer(sys.stdout, **csv_options)
    # Keys are (word_type, french_word); sort on the french word.
    for key in sorted(rows.keys(), key=lambda pair: pair[1]):
        output_line = (key[1], rows[key])
        if output_line not in already_written:
            writer.writerow(output_line)
def flatten(l):
    """Recursively yield the leaves of an arbitrarily nested iterable,
    treating strings as leaves rather than as iterables of characters."""
    for element in l:
        is_container = isinstance(element, collections.Iterable) and not isinstance(element, basestring)
        if not is_container:
            yield element
        else:
            for leaf in flatten(element):
                yield leaf
def parse_words(words=None, files=None):
    """Collect the set of non-empty, stripped, utf-8-decoded words from the
    *words* byte strings and from the first tab-separated column of each
    file in *files*.

    The defaults are None sentinels rather than the original mutable []
    defaults (shared-across-calls pitfall).
    """
    parsed = set(
        word.decode("utf-8").strip()
        for word in (words or [])
        if word.strip()
    )
    for filename in (files or []):
        # with-block closes the file handle the original leaked.
        with open(filename) as handle:
            for line in handle.readlines():
                parsed.add(line.decode("utf-8").split("\t")[0].strip())
    return set(filter(lambda x: x, parsed))
if __name__ == "__main__":
    # Command-line entry point: translate the given words (or the 1750 most
    # common French words when none are given) and write TSV to stdout.
    parser = argparse.ArgumentParser(description="Translate a given set of words")
    parser.add_argument('--word', nargs='+', default=[])
    parser.add_argument('--file', nargs='+', default=[])
    parser.add_argument('--skipfile', nargs='+', default=[])
    parser.add_argument('--category', nargs='+', default=desired_categories)
    # type=int lets argparse do the conversion and reject bad input with a
    # usage error instead of the original post-hoc int() traceback.
    parser.add_argument('--definitions', type=int, default=2)
    args = parser.parse_args()
    # Touch the cache database only after arguments parse successfully
    # (e.g. --help no longer creates wiktionary.sqlite).
    initialise_database()
    words = parse_words(args.word, args.file)
    if not words:
        words = get_1750_most_common_words()
    write_csv(get_definitions(words, tuple(args.category), args.definitions), args.skipfile)
@bradbeattie
Copy link
Author

I've used this script to produce such lists as http://quizlet.com/21061262/wiktionary-professions-flash-cards/

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment