Skip to content

Instantly share code, notes, and snippets.

@zyocum
Last active June 20, 2017 01:32
Show Gist options
  • Save zyocum/4057f2fc1b87c4295a41c5ce2c2883fe to your computer and use it in GitHub Desktop.
Save zyocum/4057f2fc1b87c4295a41c5ce2c2883fe to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""Attempt to generate false etymologies from phonetically similar words"""
import csv
import json
import sys
import random
from itertools import groupby
from metaphone.metaphone import doublemetaphone
# Part-of-speech tags offered as the valid choices for the --pos option.
POS = {
    'adj.', 'adv.', 'conj.', 'interj.', 'n.',
    'nj.', 'pl.', 'prep.', 'pron.', 'v.',
}
def double_metaphones(words):
    """Index ``words`` by their double-metaphone key.

    Args:
        words: an iterable of word strings.

    Returns:
        A dict mapping each distinct value returned by ``doublemetaphone``
        to the list of words that produced it.
    """
    # groupby only merges adjacent items, so order by the grouping key first
    ordered = sorted(words, key=doublemetaphone)
    index = {}
    for code, members in groupby(ordered, key=doublemetaphone):
        index[code] = list(members)
    return index
def query(word, pos, etyms):
    """Return the first etymology entry for ``word`` compatible with ``pos``.

    An entry is compatible when no part of speech was requested (``pos`` is
    ``None``), when the entry has no ``'pos'`` value, or when its ``'pos'``
    equals ``pos``.

    Args:
        word: the word to look up in ``etyms``.
        pos: a part-of-speech tag to filter on, or ``None`` for no filter.
        etyms: mapping of word -> list of entry dicts.

    Returns:
        The first compatible entry, or ``None`` when ``word`` is not in
        ``etyms`` or none of its entries is compatible.
    """
    # ``or ()`` covers both a missing word and a word mapped to null/empty,
    # so an unknown word yields None instead of raising TypeError.
    entries = etyms.get(word) or ()
    # this potentially throws some results away: only the first match is kept
    return next(
        (
            e for e in entries
            if pos is None or e.get('pos') is None or e.get('pos') == pos
        ),
        None,
    )
def candidates(word, pos, etyms, dms):
    """Yield the real etymology for ``word``, then fake candidates.

    The real entry (as selected by ``query``) is yielded first when one is
    found. The fakes are the entries of every word that shares ``word``'s
    double-metaphone key in ``dms``.

    Args:
        word: the word being looked up.
        pos: part-of-speech tag passed through to ``query`` (may be ``None``).
        etyms: mapping of word -> list of entry dicts.
        dms: mapping of double-metaphone key -> list of words, as built by
            ``double_metaphones``.

    Yields:
        Entry dicts taken from ``etyms``.
    """
    # the first is the real etymology
    real = query(word, pos, etyms)
    if real is not None:
        yield real
    # generate fake etymologies; a key absent from ``dms`` simply means there
    # are no similar-sounding words, not an error
    for similar in dms.get(doublemetaphone(word), ()):
        for sim_etym in etyms.get(similar) or ():
            # ``word`` belongs to its own metaphone group, so the real entry
            # shows up again here; do not offer it a second time as a fake
            if real is not None and sim_etym is real:
                continue
            yield sim_etym
if __name__ == '__main__':
    # Command-line entry point: print the query, then a shuffled, numbered
    # list mixing the real etymology with phonetically similar fakes, as
    # tab-separated rows on stdout.
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__
    )
    parser.add_argument(
        'query',
        help=(
            'a word to query for its true etymology and candidate false '
            'etymologies'
        )
    )
    parser.add_argument(
        '-p', '--pos',
        default=None,
        # sorted so the usage/help text lists the choices in a stable order
        choices=sorted(POS),
        help='part of speech used to select the true etymology'
    )
    parser.add_argument(
        '-e', '--etymologies',
        type=argparse.FileType('r'),
        default='etymologies.json',
        help=(
            'a JSON file with etymological entries keyed on word strings '
            '(e.g., https://www.dropbox.com/s/mlik0dtqv9pf72v/etymology.json)'
        )
    )
    parser.add_argument(
        '-n',
        type=int,
        default=None,
        help=(
            'limit results to the first n '
            '(all results are returned by default)'
        )
    )
    parser.add_argument(
        '-s', '--seed',
        default=None,
        help='seed string for random initialization',
    )
    args = parser.parse_args()
    # n - 1 fakes are sampled below, so n < 1 would make random.sample raise
    if args.n is not None and args.n < 1:
        parser.error('-n must be at least 1')
    writer = csv.writer(sys.stdout, delimiter='\t')
    # echo the query
    writer.writerow((args.query, args.pos))
    if args.seed is not None:
        random.seed(args.seed)
    # compile etymologies dict (and close the file argparse opened for us)
    with args.etymologies as handle:
        etyms = json.load(handle)
    # compile double metaphone dict
    dms = double_metaphones(etyms.keys())
    # look the real etymology up explicitly: candidates() only yields it
    # first when it exists, so unpacking blindly would mislabel a fake
    real = None
    if etyms.get(args.query):
        real = query(args.query, args.pos, etyms)
    if real is None:
        print(
            'No etymology found for {!r}'.format(args.query),
            file=sys.stderr
        )
        sys.exit(1)
    # get fake etymology candidates (the first item is the real one)
    _, *fakes = candidates(args.query, args.pos, etyms, dms)
    real_definition = real['definition']
    # dedupe, drop the real definition so it cannot appear twice, and sort
    # so that --seed gives reproducible output (set order of strings varies
    # between runs); key=str keeps the sort total for non-string values
    etymologies = sorted(
        {f['definition'] for f in fakes} - {real_definition},
        key=str
    )
    if not etymologies:
        print('Not enough etymologies!', file=sys.stderr)
        sys.exit(1)
    # keep n - 1 fakes so that, with the real one, n rows are printed
    if args.n is not None and args.n - 1 < len(etymologies):
        etymologies = random.sample(etymologies, k=args.n - 1)
    etymologies.append(real_definition)
    random.shuffle(etymologies)
    for i, etymology in enumerate(etymologies, 1):
        writer.writerow((i, etymology))
@zyocum
Copy link
Author

zyocum commented Jun 20, 2017

Requires Metaphone:

pip3 install Metaphone

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment