Skip to content

Instantly share code, notes, and snippets.

@jonycgn
Last active September 5, 2020 17:25
Show Gist options
  • Save jonycgn/6d2fd7932ff41ce6eccd40690bb24115 to your computer and use it in GitHub Desktop.
Save jonycgn/6d2fd7932ff41ce6eccd40690bb24115 to your computer and use it in GitHub Desktop.
Better diceware lists for German
#!/usr/bin/python3
"""Better diceware lists for German

Diceware (http://world.std.com/~reinhold/diceware.html) is a great way to
generate random, yet memorizable passphrases.

This Python script is designed to read a "Grundformliste" (list of base forms)
of German words in the format published by the Institut für deutsche Sprache
(Institute for German Language). It parses the list, selects the best words, and
outputs a word list compatible with diceware.

The "best" words are defined as words that are at the same time short and
common, so that they are easy to remember but also easy to type. You can
trade off between shortness and commonality using the `--difficulty` parameter.

To make a list, you can download a published Grundformliste from the Institute's
web site (http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html) and
unzip it. By default, the script expects an input file in the working directory
named derewo-v-ww-bll-320000g-2012-12-31-1.0.txt and outputs to wordlist_de.txt.
"""
import argparse
import collections
import io
import math
import re
import sys
# Record for one vocabulary entry: the spelling, its frequency class, and the
# combined score that the word is ranked by.
Word = collections.namedtuple("Word", ["word", "freq_class", "score"])
class Formatter(
    argparse.ArgumentDefaultsHelpFormatter,
    argparse.RawDescriptionHelpFormatter,
):
    """Help formatter that shows argument defaults and keeps the description raw."""
def parse_words(file, weight, to_ascii):
    """Read a word list line by line and return a list of Word objects.

    Args:
        file: Iterable of text lines. A usable line starts with a word,
            followed by a single space and an integer frequency class; any
            line that does not start this way is skipped.
        weight: Trade-off factor for the score, which is computed as
            ``weight * len(word) + (1 - weight) * freq_class``.
        to_ascii: If true, ä, ö, ü and ß are replaced by ae, oe, ue and ss
            before the line is matched.

    Returns:
        A list containing one Word per distinct lower-cased spelling, in
        order of first appearance. If a spelling occurs several times, the
        entry with the lowest frequency class is kept.
    """
    # Regex that matches word and frequency class. Note: we only allow German
    # letters (Latin letters, umlauts, and ß) plus hyphens to make sure any
    # potential user will know how to type the word on a German keyboard.
    line_regex = re.compile(r"([a-zäöüß\-]+) (\d+)")
    umlaut_table = str.maketrans({
        "ä": "ae",
        "ö": "oe",
        "ü": "ue",
        "ß": "ss",
    })
    # Matches a parenthesised list of alternative endings and captures the
    # first alternative.
    alternatives_regex = re.compile(r"\(([a-zäöüß]+)[a-zäöüß,]*?\)")

    words = {}
    for line in file:
        # First, bring everything to lower case. We don't want to distinguish
        # between upper and lower case, because it can cause ambiguities when
        # remembering the word.
        line = line.lower()
        # If ASCII conversion is requested, map according to umlaut_table.
        if to_ascii:
            line = line.translate(umlaut_table)
        # Then, if the line contains several alternatives in parentheses,
        # simply pick the first one. For example: "ein(e) -> eine",
        # "welch(er,e,es) -> welcher".
        line = alternatives_regex.sub(r"\1", line)
        # Match at the beginning of the line.
        match = line_regex.match(line)
        if not match:
            continue
        word = match.group(1)
        freq_class = int(match.group(2))
        # If the spelling already exists, only update it if the new word is
        # more common.
        if word not in words or words[word].freq_class > freq_class:
            words[word] = Word(
                word,
                freq_class,
                weight * len(word) + (1 - weight) * freq_class,
            )
    # Return a real list, as documented, rather than a dict view that cannot
    # be indexed or sliced.
    return list(words.values())
def dice_rolls(index, rolls):
    """Return ``index`` as a string of ``rolls`` die faces (digits 1-6).

    The index is written in base 6 with the most significant digit first,
    and every base-6 digit is shifted by one so that it reads as a die face.
    Only the lowest ``rolls`` base-6 digits of ``index`` are used.
    """
    faces = []
    remainder = index
    # Peel off base-6 digits starting with the least significant one.
    for _ in range(rolls):
        remainder, digit = divmod(remainder, 6)
        faces.append(str(digit + 1))
    # Digits were collected in reverse order; flip them for output.
    return "".join(reversed(faces))
def main(args):
    """Build the word list described by ``args`` and write it to a file.

    Reads the input file, ranks all parsed words by ascending score, keeps
    the requested number of them and writes them to the output file in
    alphabetical order. A one-line summary (and, if the input did not contain
    enough words, a warning) is printed to stderr.

    Args:
        args: Parsed command-line namespace providing ``input``, ``output``,
            ``encoding``, ``difficulty``, ``ascii``, ``count`` and ``rolls``.
    """
    # A positive --count takes precedence; otherwise one word is needed for
    # every possible outcome of `rolls` six-sided dice.
    count = args.count if args.count > 0 else 6 ** args.rolls

    with io.open(args.input, "r", encoding=args.encoding) as file:
        words = parse_words(file, args.difficulty, args.ascii)

    words = sorted(words, key=lambda w: w.score)
    best = sorted(words[:count], key=lambda w: w.word)

    # The input may hold fewer usable words than requested, so the statistics
    # are based on the number of words actually selected, not on `count`.
    selected = len(best)
    if selected:
        avg_length = sum(len(w.word) for w in best) / selected
        avg_freq_class = sum(w.freq_class for w in best) / selected
    else:
        avg_length = avg_freq_class = 0.0
    print(
        "Selected the best {} out of {} words. "
        "Average length: {:0.3}, average frequency class: {:0.3}."
        "".format(selected, len(words), avg_length, avg_freq_class),
        file=sys.stderr)
    if selected < count:
        print(
            "Warning: only {} of the {} requested words are available; "
            "the output list is incomplete.".format(selected, count),
            file=sys.stderr)

    with io.open(args.output, "w", encoding="utf-8") as file:
        if args.count > 0:
            # Plain list: one word per line.
            for word in best:
                print(word.word, file=file)
        else:
            # Diceware list: dice roll sequence, a tab, then the word.
            for i, word in enumerate(best):
                print(
                    "{}\t{}".format(dice_rolls(i, args.rolls), word.word),
                    file=file)
if __name__ == "__main__":
    # Command-line entry point: declare the options, validate them, and hand
    # the parsed namespace to main().
    parser = argparse.ArgumentParser(
        formatter_class=Formatter,
        description=__doc__)
    parser.add_argument(
        "--input", "-i", type=str,
        default="derewo-v-ww-bll-320000g-2012-12-31-1.0.txt",
        help="Input filename.")
    parser.add_argument(
        "--output", "-o", type=str,
        default="wordlist_de.txt",
        help="Output filename.")
    parser.add_argument(
        "--rolls", "-r", type=int, default=5,
        help="Output diceware list for this number of dice rolls.")
    parser.add_argument(
        "--count", "-c", type=int, default=0,
        help="Output plain word list with this number of words. If > 0, "
             "overrides --rolls.")
    parser.add_argument(
        "--difficulty", "-d", type=float, default=.4,
        help="Difficulty of the vocabulary in the range of 0 (uses the most "
             "common words) to 1 (uses the shortest words).")
    parser.add_argument(
        "--ascii", "-a", action="store_true",
        help="Convert ä, ö, ü, ß to ae, oe, ue, ss.")
    parser.add_argument(
        "--encoding", "-e", type=str, default="iso-8859-15",
        help="Input file encoding. Output file will have utf-8 encoding.")
    args = parser.parse_args()
    # Report invalid option values through argparse so the user gets the
    # usage message and exit status 2 instead of a Python traceback.
    if not (args.count > 0 or args.rolls > 0):
        parser.error("Need --count or --rolls.")
    if not 0 <= args.difficulty <= 1:
        parser.error("--difficulty needs to be between 0 and 1, inclusive.")
    main(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment