Last active
June 20, 2017 01:32
-
-
Save zyocum/4057f2fc1b87c4295a41c5ce2c2883fe to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Attempt to generate false etymologies from phonetically similar words""" | |
import csv | |
import json | |
import sys | |
import random | |
from itertools import groupby | |
from metaphone.metaphone import doublemetaphone | |
# Part-of-speech tags offered as the valid choices for the -p/--pos option.
# NOTE(review): 'nj.' is not an obvious abbreviation -- confirm it really
# occurs in the etymology data before relying on it.
POS = set(
    "adj. adv. conj. interj. n. nj. pl. prep. pron. v.".split()
)
def double_metaphones(words):
    """Index ``words`` by the key ``doublemetaphone`` computes for each.

    Args:
        words: an iterable of words.

    Returns:
        A dict mapping each distinct ``doublemetaphone(word)`` value to the
        list of words that produced it. Keys are inserted in ascending key
        order; words sharing a key keep their input order.
    """
    # groupby only merges adjacent items, so sort by the same key first
    ordered = sorted(words, key=doublemetaphone)
    index = {}
    for code, members in groupby(ordered, key=doublemetaphone):
        index[code] = list(members)
    return index
def query(word, pos, etyms):
    """Return the first etymology entry for ``word`` compatible with ``pos``.

    An entry is compatible when no part of speech was requested (``pos`` is
    ``None``), when the entry carries no ``'pos'`` value, or when its
    ``'pos'`` equals ``pos``.

    Args:
        word: the word to look up in ``etyms``.
        pos: a part-of-speech tag to filter on, or ``None`` to accept any.
        etyms: a mapping from word to a sequence of etymology entry dicts.

    Returns:
        The first compatible entry, or ``None`` when ``word`` has no entries
        in ``etyms`` or none of them is compatible.
    """
    # ``or ()`` covers both a missing word and a null entry list; iterating
    # the bare ``etyms.get(word)`` result would raise TypeError for those.
    entries = etyms.get(word) or ()
    # only the first match is kept, so any later matches are thrown away
    return next(
        (
            entry for entry in entries
            if pos is None
            or entry.get('pos') is None
            or entry.get('pos') == pos
        ),
        None,
    )
def candidates(word, pos, etyms, dms):
    """Yield the real etymology of ``word``, then false candidates.

    The real etymology is whatever ``query(word, pos, etyms)`` returns. It is
    yielded first, but only when it is not ``None``; otherwise every yielded
    item is a false candidate. The false candidates are the entries of all
    words that share ``word``'s ``doublemetaphone`` key in ``dms``.

    Args:
        word: the word being queried.
        pos: a part-of-speech tag passed through to ``query``, or ``None``.
        etyms: a mapping from word to a list of etymology entries.
        dms: a mapping from ``doublemetaphone`` key to a list of words.

    Yields:
        Etymology entries taken from ``etyms``.
    """
    # the first is the real etymology
    real = query(word, pos, etyms)
    if real is not None:
        yield real
    # generate fake etymologies; a word may have no phonetic group at all,
    # in which case there is simply nothing more to yield
    for similar in dms.get(doublemetaphone(word), ()):
        for sim_etym in etyms[similar]:
            # the word can sit in its own phonetic group, so make sure the
            # real entry is not offered a second time as a fake one
            if sim_etym is real:
                continue
            yield sim_etym
if __name__ == '__main__':
    # Command-line entry point: print the queried word and its part of
    # speech, then a shuffled, numbered, tab-separated list made of the real
    # etymology's definition mixed with definitions of similar-sounding words.
    import argparse

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__
    )
    parser.add_argument(
        'query',
        help=(
            'a word to query for its true etymology and candidate false '
            'etymologies'
        )
    )
    parser.add_argument(
        '-p', '--pos',
        default=None,
        # sorted so that --help lists the choices in a stable order
        choices=sorted(POS),
        help='part of speech used to select the true etymology'
    )
    parser.add_argument(
        '-e', '--etymologies',
        type=argparse.FileType('r'),
        default='etymologies.json',
        help=(
            'a JSON file with etymological entries keyed on word strings '
            '(e.g., https://www.dropbox.com/s/mlik0dtqv9pf72v/etymology.json)'
        )
    )
    parser.add_argument(
        '-n',
        type=int,
        default=None,
        help=(
            'limit results to the first n '
            '(all results are returned by default)'
        )
    )
    parser.add_argument(
        '-s', '--seed',
        default=None,
        help='seed string for random initialization',
    )
    args = parser.parse_args()
    # a non-positive n would reach random.sample() with a negative k
    if args.n is not None and args.n < 1:
        parser.error('-n must be a positive integer')

    writer = csv.writer(sys.stdout, delimiter='\t')
    # echo the query
    writer.writerow((args.query, args.pos))
    if args.seed is not None:
        random.seed(args.seed)

    # compile etymologies dict (the context manager closes the input file)
    with args.etymologies as source:
        etyms = json.load(source)
    # compile double metaphone dict
    dms = double_metaphones(etyms.keys())

    # Resolve the real etymology explicitly: candidates() only yields it
    # first when one exists, so unpacking its output alone cannot tell a
    # real entry from the first fake one. Unknown words are reported here
    # rather than passed on to the lookup helpers.
    real = None
    if etyms.get(args.query):
        real = query(args.query, args.pos, etyms)
    if real is None:
        print(
            'No etymology found for {!r}!'.format(args.query),
            file=sys.stderr
        )
        sys.exit(1)

    # get fake etymology candidates (the leading item is the real one)
    _, *fakes = candidates(args.query, args.pos, etyms, dms)
    # Drop the real definition so it cannot appear twice in the output, and
    # sort because set order varies between runs (string hash randomization),
    # which would otherwise defeat --seed. Assumes definitions are strings.
    etymologies = sorted(
        {f['definition'] for f in fakes} - {real['definition']}
    )
    # at least one false etymology is needed next to the real one
    if not etymologies:
        print('Not enough etymologies!', file=sys.stderr)
        sys.exit(1)
    # keep n - 1 fakes so that, with the real one, n results are printed
    if args.n is not None and args.n <= len(etymologies):
        etymologies = random.sample(etymologies, k=args.n - 1)
    etymologies.append(real['definition'])
    random.shuffle(etymologies)
    for i, etymology in enumerate(etymologies, 1):
        writer.writerow((i, etymology))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Requires Metaphone: