Last active
September 2, 2018 21:25
-
-
Save asimihsan/8239189 to your computer and use it in GitHub Desktop.
NSA codeword generator.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
HOMOGENAKA | |
SPALLYILL | |
ENEUCHSTROKY | |
BOBACHUELESS | |
SYNAPTEPIERAGE | |
HUMBLEENCRATY | |
EMITTERSOMEDAY | |
BUGLETINNLESS | |
BEAUPAINTER | |
TOXEMIASHALLOW | |
SANDANBOXER | |
WARMFULTEASER | |
TERSESTINGY | |
MALEOBEGLOZE | |
KENNERCHOLLER | |
PROPLEXUNHUMAN | |
ATROPICGABY | |
SCENICSURGE | |
OPTICASHY | |
BETROTHONRUSH |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import contextlib | |
import cPickle as pickle | |
import gzip | |
import os | |
import random | |
import re | |
import sys | |
import nltk | |
def get_nsa_codewords(pickle_cache_file="nsa_codewords.pickle.gz",
                      token_min_length=3, token_max_length=7,
                      total_codewords=20,
                      dictionary_file="/usr/share/dict/words"):
    """Generate NSA-style code words.

    This function requires NLTK [1] to be installed.

    An NSA-style codeword is typically two singular nouns (NN), for
    example "HOWLERMONKEY". Less frequently it is an adjective (JJ)
    then a singular noun, for example "EGOTISTICALGIRAFFE" [2].
    Hence only two types of codeword are generated:

    1.  (NN, NN)
    2.  (JJ, NN)

    Each type is picked with roughly equal probability. If no
    adjectives are available, every codeword is of type (NN, NN).

    This is a generator function: no work (including building or
    reading the cache) happens until the result is first iterated.

    The first time you run this function it will take a long time to
    complete. However it will build a pickled cache file and re-use
    that to make subsequent runs faster. Words read back from the
    cache are filtered by the requested token lengths, but a cache can
    never supply words that were outside the length bounds it was
    built with; delete the cache file to rebuild it with wider bounds.

    Using NLTK's default part-of-speech (POS) tagger, a Maximum
    Entropy (maxent) tagger trained on the Penn Treebank corpus,
    isn't ideal. This is because we're passing in single words as-is
    without the context of a surrounding sentence, whereas the model
    is trained on sentences. [3] [4]

    param: pickle_cache_file: path to file to cache lists of
        adjectives and nouns. The file is unpickled, so it must be
        a trusted file (ideally one written by this function).
    param: token_min_length: minimum (inclusive) length of a token
        in a code word.
    param: token_max_length: maximum (inclusive) length of a token
        in a code word.
    param: total_codewords: total number of codewords to generate.
    param: dictionary_file: path to a word list with one word per
        line; only read when the cache file does not exist yet.

    yields: upper-case codeword strings, one per iteration.

    raises: IndexError: if no nouns are available to choose from.

    References

    [1] http://nltk.org/
    [2] NSA Codewords
        http://cpunks.wordpress.com/2013/10/12/nsa-codewords
    [3] A Maximum Entropy Model for Part-of-Speech Tagging
        http://acl.ldc.upenn.edu/W/W96/W96-0213.pdf
    [4] Learning to Classify Text, The NLTK Book
        http://nltk.org/book/ch06.html
    """
    if os.path.isfile(pickle_cache_file):
        # NOTE: unpickling can execute arbitrary code. Only point
        # pickle_cache_file at a cache you trust.
        with contextlib.closing(gzip.open(pickle_cache_file,
                                          "rb")) as f_in:
            words = pickle.load(f_in)

        # The cache may have been built with different length bounds, so
        # enforce the bounds requested for this call.
        def has_requested_length(word):
            return token_min_length <= len(word) <= token_max_length

        nouns = [word for word in words["nouns"]
                 if has_requested_length(word)]
        adjectives = [word for word in words["adjectives"]
                      if has_requested_length(word)]
    else:
        # Compile once instead of re-parsing the pattern for every line.
        word_pattern = re.compile("^[a-z]{%s,%s}$" %
                                  (token_min_length, token_max_length))
        nouns = []
        adjectives = []
        with open(dictionary_file) as f_in:
            for line in f_in:
                word = line.strip()
                if not word_pattern.match(word):
                    continue
                # pos_tag returns [(token, tag)]; only the tag is needed.
                tag = nltk.pos_tag([word])[0][1]
                if tag == "NN":
                    nouns.append(word)
                elif tag == "JJ":
                    adjectives.append(word)
        # Written only after tagging finishes, so an interrupted tagging
        # run does not leave a partial cache behind.
        with contextlib.closing(gzip.open(pickle_cache_file,
                                          "wb")) as f_out:
            pickle.dump({"nouns": nouns, "adjectives": adjectives},
                        f_out, -1)

    for _ in range(total_codewords):
        # random.random() is always drawn first so the sequence of random
        # calls is the same as before whenever adjectives exist.
        if random.random() > 0.5 or not adjectives:
            first_token = random.choice(nouns)
        else:
            first_token = random.choice(adjectives)
        yield (first_token + random.choice(nouns)).upper()
if __name__ == "__main__":
    # Script entry point: print each generated codeword on its own line.
    sys.stdout.writelines("%s\n" % generated
                          for generated in get_nsa_codewords())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment