Last active
December 14, 2015 23:39
-
-
Save bradbeattie/5166997 to your computer and use it in GitHub Desktop.
Translates a given set of French words into English along with English definitions. Shouldn't be too difficult to adapt to other languages. Makes use of a simple two-column sqlite database to cache the API responses. Ignores duplicate words, obsolete words, etc.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
from bs4 import BeautifulSoup | |
import random | |
import collections | |
import argparse | |
import json | |
import re | |
import requests | |
import sqlite3 | |
import sys | |
import unicodecsv | |
# Wiktionary section headings that contain translatable word classes we keep.
desired_categories = ("Pronoun", "Noun", "Verb", "Preposition", "Adjective", "Article", "Adverb", "Contraction", "Numeral", "Phrase", "Proverb")
# Section headings that carry no definition text and are skipped outright.
skipped_categories = ("Derived terms", "Conjugation", "Pronunciation", "Etymology", "Anagrams", "See also", "Alternative forms", "Initialism", "Verb form", "References", "Letter", "Related terms")
# Module-level sqlite cache of raw API responses, shared by the helpers below.
database = sqlite3.connect("wiktionary.sqlite")
cursor = database.cursor()
def initialise_database():
    """Ensure the two-column response-cache table exists in the sqlite file.

    Uses CREATE TABLE IF NOT EXISTS instead of the original probe-and-catch
    (SELECT COUNT(*) then except sqlite3.OperationalError): a single
    idempotent statement with identical end state.
    """
    cursor.execute("CREATE TABLE IF NOT EXISTS responses(url TEXT, response TEXT, PRIMARY KEY(url DESC))")
def get_french_definition_data(word):
    """Fetch the Wiktionary mobileview sections for *word* and return the
    rows that belong to its "French" language section.

    Returns [] when the API response has no usable sections.  When the page
    has no French section, raises Exception(set_of_french_translations) so
    the caller can retry with the translation candidates found in any
    "Translations" section (empty set if none were found).
    """
    # BUG FIX: the query string contained the mojibake "§ions=all"
    # (HTML-collapsed "&sect..."); the API parameter must be "&sections=all".
    data = cached_get(u"http://en.wiktionary.org/w/api.php?action=mobileview&sections=all&format=json&page=%s" % word)
    try:
        data = json.loads(data)["mobileview"]["sections"]
    except KeyError:
        # Missing page / API error payload: nothing to translate.
        return []
    french = []
    for row in data:
        # toclevel 1 rows are language headings; collect everything between
        # the "French" heading and the next language heading.
        if row.get("toclevel", None) == 1:
            if row["line"] == "French":
                french.append(row)
            elif french:
                break
        elif french:
            french.append(row)
    if not french:
        # No French section: scrape French translation links as candidates.
        possible_translations = set()
        for row in data:
            if row.get("line", None) == "Translations":
                for translation in BeautifulSoup(row["text"]).select("span[lang=fr]"):
                    possible_translations.add(translation.text)
        raise Exception(possible_translations)
    return french
def passes_checks(phrase):
    """Return True unless *phrase* is a cross-reference to another entry
    ("plural of", "participle of", ...) or carries a disqualifying marker
    such as (obsolete), (archaic), (rare) or (dated)."""
    if "-person" in phrase.lower():
        return False
    cross_reference_patterns = (
        "participle of",
        "form of",
        "nonstandard spelling of",
        "common misspelling of",
        "alternative spelling",
        "plural of",
        "clarification of",
    )
    if any(search(pattern, phrase) for pattern in cross_reference_patterns):
        return False
    unwanted_markers = ("(obsolete", "(archaic", "(ne", "(rare", "(dated")
    return not any(marker in phrase for marker in unwanted_markers)
def get_french_definition_triplets(word, valid_categories, number_of_definitions):
    """Return a list of (word_type, french_word, english_definition) triplets
    for *word*, keeping only sections whose heading starts with one of
    *valid_categories* and at most *number_of_definitions* sub-definitions
    per section.

    If *word* has no French section, get_french_definition_data raises an
    Exception carrying a set of possible French translations; those are
    recursed into instead (skipping the word itself to avoid infinite loops).
    """
    triplets = []
    # If the given word isn't french, try its translations
    try:
        french = get_french_definition_data(word)
        print >> sys.stderr, "[WORD]", word
    except Exception as e:
        # A set payload is the "not French, here are candidates" signal;
        # anything else is a genuine error and is re-raised.
        if isinstance(e.args[0], set):
            for possible_word in e.args[0]:
                if possible_word != word:
                    triplets.extend(get_french_definition_triplets(possible_word, valid_categories, number_of_definitions))
        else:
            raise
        return triplets
    # Otherwise, process the french data
    for row in french:
        word_type = row["line"]
        if word_type.startswith(skipped_categories):
            pass
        elif word_type.startswith(valid_categories):
            # Keep only the bare category name, e.g. "Noun 2" -> "Noun".
            word_type = word_type.split(" ")[0]
            soup = BeautifulSoup(row["text"])
            # One entry per <ol><li> sub-definition; lowercase leading
            # "To/A/An/Of" and keep only the first line of each item.
            english_definition = [
                re.sub(r"\b(To|A|An|Of)\b", lambda x: x.groups()[0].lower(), subdefinition.text.split("\n")[0])
                for subdefinition in soup.select("ol > li")
            ]
            english_definition = "; ".join([
                subdefinition
                for subdefinition in english_definition
                if passes_checks(subdefinition)
            ][0:number_of_definitions])
            if english_definition:
                # Append "(noun, extra)" where extra is the text right after
                # the headword, e.g. gender; a bare "(" means no useful extra.
                english_extras = [word_type.lower()]
                try:
                    english_extras.append(soup.select("p strong, p b")[0].find_next_sibling().text)
                    if english_extras[-1] == "(":
                        english_extras = english_extras[0:1]
                except:
                    pass
                english_definition += " (%s)" % ", ".join(english_extras)
                if passes_checks(english_definition):
                    # The headword is the bold text; fall back to the first
                    # paragraph when no bold element exists.
                    try:
                        french_word = soup.select("p strong, p b")[0].text
                    except:
                        french_word = soup.select("p")[0].text
                    triplets.append((word_type, french_word, english_definition))
    return triplets
def search(pattern, string):
    """Case-insensitively test whether *pattern* occurs in *string* bounded
    by word boundaries on both sides."""
    match = re.search(r"\b%s\b" % pattern, string, re.IGNORECASE)
    return match is not None
def cached_get(url):
    """HTTP GET *url*, memoised in the sqlite "responses" table so repeated
    runs never re-hit the Wiktionary API."""
    cached = list(cursor.execute(u"SELECT response FROM responses WHERE url = ?", (url, )))
    if cached:
        return cached[0][0]
    # Cache miss: fetch over the network and persist the body for next time.
    response = requests.get(url).text
    cursor.execute(u"INSERT INTO responses (url, response) VALUES (?, ?)", (url, response))
    database.commit()
    return response
def get_1750_most_common_words():
    """Scrape the Wiktionary appendix of the 1750 most frequently used
    French words and return their page titles as a set."""
    words = set()
    page = BeautifulSoup(cached_get("http://en.wiktionary.org/wiki/Appendix:List_of_the_1750_most_frequently_used_French_words"))
    content = page.find(id="mw-content-text")
    for heading in content.find_all("h2") + content.find_all("h3"):
        node = heading
        # Walk the siblings following this heading until the next heading
        # (or the end of the document), harvesting every /wiki/ link title.
        while getattr(node.find_next_sibling(), "name", None) not in ("h2", "h3", None):
            node = node.find_next_sibling()
            for anchor in node.find_all("a"):
                if anchor.attrs["href"].startswith("/wiki/"):
                    words.add(anchor.attrs["title"])
    return words
def get_definitions(words, valid_categories, number_of_definitions):
    """Build a {(word_type, french_word): english_definition} mapping for
    *words*, skipping entries whose key or definition text was already seen.

    The word list is shuffled so repeated partial runs cover different words.
    """
    shuffled = list(words)
    random.shuffle(shuffled)
    definitions = {}
    for word in shuffled:
        for word_type, french_word, english_definition in get_french_definition_triplets(word, valid_categories, number_of_definitions):
            key = (word_type, french_word)
            # Keep first occurrence only, by key AND by definition text.
            if key in definitions.keys() or english_definition in definitions.values():
                continue
            definitions[key] = english_definition
    return definitions
def write_csv(rows, skipfiles):
    """Write *rows* ({(word_type, french_word): definition}) to stdout as
    backtick-quoted, tab-separated CSV sorted by French word, omitting any
    (french_word, definition) line already present in one of *skipfiles*."""
    csv_options = {
        "encoding": "utf-8",
        "delimiter": "\t",
        "quotechar": "`",
    }
    # Collect previously emitted lines so reruns can exclude them.
    already_written = set()
    for skipfile in skipfiles:
        for row in unicodecsv.reader(open(skipfile), **csv_options):
            already_written.add(tuple(row))
    writer = unicodecsv.writer(sys.stdout, **csv_options)
    for word_type, french_word in sorted(rows.keys(), key=lambda pair: pair[1]):
        line = (french_word, rows[(word_type, french_word)])
        if line not in already_written:
            writer.writerow(line)
def flatten(l):
    """Recursively yield the leaf elements of an arbitrarily nested
    iterable, treating strings as atoms rather than character sequences."""
    for element in l:
        nested = isinstance(element, collections.Iterable) and not isinstance(element, basestring)
        if not nested:
            yield element
        else:
            for leaf in flatten(element):
                yield leaf
def parse_words(words=(), files=()):
    """Collect candidate words from the command line (*words*, UTF-8 byte
    strings) and from the first tab-separated column of each file in
    *files*, returning a set of non-empty unicode strings.

    Defaults are immutable tuples rather than the original mutable [] —
    shared mutable defaults are a classic Python pitfall.
    """
    collected = set(
        word.decode("utf-8").strip()
        for word in words
        if word.strip()
    )
    for filename in files:
        # Iterate the handle directly (readlines() loaded the whole file)
        # and close it deterministically instead of leaking the handle.
        with open(filename) as handle:
            for line in handle:
                collected.add(line.decode("utf-8").split("\t")[0].strip())
    # Drop any empty strings that slipped through from the file branch.
    return set(word for word in collected if word)
if __name__ == "__main__":
    # CLI entry point: translate the requested words (or, when no input is
    # supplied at all, the 1750 most common French words) and print a TSV.
    parser = argparse.ArgumentParser(description="Translate a given set of words")
    parser.add_argument('--word', nargs='+', default=[])
    parser.add_argument('--file', nargs='+', default=[])
    parser.add_argument('--skipfile', nargs='+', default=[])
    parser.add_argument('--category', nargs='+', default=desired_categories)
    parser.add_argument('--definitions', default=2)
    initialise_database()
    args = parser.parse_args()
    # Explicit words/files win; otherwise fall back to the frequency list.
    words = parse_words(args.word, args.file) or get_1750_most_common_words()
    definitions = get_definitions(words, tuple(args.category), int(args.definitions))
    write_csv(definitions, args.skipfile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I've used this script to produce such lists as http://quizlet.com/21061262/wiktionary-professions-flash-cards/