jonycgn/wuerfelware.py

## wuerfelware.py
#!/usr/bin/python3
"""Better diceware lists for German

Diceware (http://world.std.com/~reinhold/diceware.html) is a great way to
generate random, yet memorizable passphrases.

This Python script is designed to read a "Grundformliste" (list of base forms)
of German words in the format published by the Institut für deutsche Sprache
(Institute for German Language). It parses the list, selects the best words, and
outputs a word list compatible with diceware.

The "best" words are defined as words that are at the same time short and
common, so that they are easy to remember but also easy to type. You can
trade off between shortness and commonality using the `--difficulty` parameter.

To make a list, you can download a published Grundformliste from the Institute's
web site (http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html) and
unzip it. By default, the script expects an input file in the working directory
named derewo-v-ww-bll-320000g-2012-12-31-1.0.txt and outputs to wordlist_de.txt.
"""

import argparse
import collections
import io
import math
import re
import sys


Word = collections.namedtuple("Word", "word, freq_class, score")


class Formatter(
    argparse.ArgumentDefaultsHelpFormatter,
    argparse.RawDescriptionHelpFormatter,
):
  """Help formatter combining two stock argparse formatters.

  Argument defaults are appended to each argument's help text, and the parser
  description is printed as written instead of being re-wrapped.
  """


def parse_words(file, weight, to_ascii):
  """Reads text line-by-line and returns a list of Word objects.

  Args:
    file: Iterable of text lines, e.g. an open text file.
    weight: Trade-off factor for the score, which is computed as
      `weight * len(word) + (1 - weight) * freq_class`.
    to_ascii: If true, ä, ö, ü, ß are replaced by ae, oe, ue, ss before the
      line is matched.

  Returns:
    A list with one Word per distinct (lower-cased) spelling. If a spelling
    occurs more than once, the entry with the lowest frequency class is kept.
    Lines that do not start with a word followed by a space and a number are
    skipped.
  """

  # Regex that matches word and frequency class. Note: we only allow German
  # letters (Latin letters, umlauts, and ß) plus hyphens to make sure any
  # potential user will know how to type the word on a German keyboard.
  line_regex = re.compile(r"([a-zäöüß\-]+) (\d+)")

  umlaut_table = str.maketrans({
      "ä": "ae",
      "ö": "oe",
      "ü": "ue",
      "ß": "ss",
  })
  # Matches a parenthesized, comma-separated list of endings and captures the
  # first one.
  alternatives_regex = re.compile(r"\(([a-zäöüß]+)[a-zäöüß,]*?\)")

  words = {}
  for line in file:
    # First, bring everything to lower case. We don't want to distinguish
    # between upper and lower case, because it can cause ambiguities when
    # remembering the word.
    line = line.lower()
    # If ASCII conversion is requested, map according to umlaut_table.
    if to_ascii:
      line = line.translate(umlaut_table)
    # Then, if the line contains several alternatives in parentheses, simply
    # pick the first one. For example: "ein(e) -> eine", "welch(er,e,es) ->
    # welcher".
    line = alternatives_regex.sub(r"\1", line)
    # Match at the beginning of the line.
    match = line_regex.match(line)
    if not match:
      continue
    word = match.group(1)
    freq_class = int(match.group(2))
    # If the spelling already exists, only update it if the new word is more
    # common.
    if word not in words or words[word].freq_class > freq_class:
      score = weight * len(word) + (1 - weight) * freq_class
      words[word] = Word(word, freq_class, score)
  # Return a real list, as documented, rather than a live view of the dict.
  return list(words.values())


def dice_rolls(index, rolls):
  """Returns the dice sequence for a zero-based word index.

  The index is written in base 6 using exactly `rolls` digits, most
  significant digit first, with every digit shifted by one so that it reads
  as a die face (1-6). Only the lowest `rolls` base-6 digits of `index` are
  used.
  """
  faces = []
  for _ in range(rolls):
    index, digit = divmod(index, 6)
    faces.append(str(digit + 1))
  # Digits were collected least significant first.
  return "".join(reversed(faces))


def main(args):
  """Builds the word list described by `args` and writes it to the output file.

  Args:
    args: Parsed command-line arguments providing `input`, `output`,
      `encoding`, `difficulty`, `ascii`, `rolls` and `count`.
  """
  # A positive --count requests a plain list and overrides --rolls.
  plain_list = args.count > 0
  count = args.count if plain_list else 6 ** args.rolls

  with io.open(args.input, "r", encoding=args.encoding) as file:
    words = parse_words(file, args.difficulty, args.ascii)

  # Keep the `count` lowest-scoring words, then order them alphabetically.
  words = sorted(words, key=lambda w: w.score)
  best = sorted(words[:count], key=lambda w: w.word)

  # The input may yield fewer usable words than requested, so report and
  # average over what was actually selected rather than over `count`.
  selected = len(best)
  if selected < count:
    print(
        "Warning: only {} usable words found, but {} were requested."
        "".format(selected, count),
        file=sys.stderr)
  divisor = max(selected, 1)  # Avoid dividing by zero for an empty selection.

  print(
      "Selected the best {} out of {} words. "
      "Average length: {:0.3}, average frequency class: {:0.3}."
      "".format(
          selected, len(words),
          sum(len(w.word) for w in best) / divisor,
          sum(w.freq_class for w in best) / divisor),
      file=sys.stderr)

  with io.open(args.output, "w", encoding="utf-8") as file:
    if plain_list:
      for word in best:
        print(word.word, file=file)
    else:
      # Diceware format: dice sequence, tab, word.
      for i, word in enumerate(best):
        print("{}\t{}".format(dice_rolls(i, args.rolls), word.word), file=file)


if __name__ == "__main__":
  # Command-line entry point: parse the options, validate them, run main().
  parser = argparse.ArgumentParser(
      formatter_class=Formatter,
      description=__doc__)

  parser.add_argument(
      "--input", "-i", type=str,
      default="derewo-v-ww-bll-320000g-2012-12-31-1.0.txt",
      help="Input filename.")
  parser.add_argument(
      "--output", "-o", type=str,
      default="wordlist_de.txt",
      help="Output filename.")
  parser.add_argument(
      "--rolls", "-r", type=int, default=5,
      help="Output diceware list for this number of dice rolls.")
  parser.add_argument(
      "--count", "-c", type=int, default=0,
      help="Output plain word list with this number of words. If > 0, "
           "overrides --rolls.")
  parser.add_argument(
      "--difficulty", "-d", type=float, default=.4,
      help="Difficulty of the vocabulary in the range of 0 (uses the most "
           "common words) to 1 (uses the shortest words).")
  parser.add_argument(
      "--ascii", "-a", action="store_true",
      help="Convert ä, ö, ü, ß to ae, oe, ue, ss.")
  parser.add_argument(
      "--encoding", "-e", type=str, default="iso-8859-15",
      help="Input file encoding. Output file will have utf-8 encoding.")

  args = parser.parse_args()
  # Report invalid option values through argparse so the user gets a usage
  # message and exit status 2 instead of a traceback.
  if not (args.count > 0 or args.rolls > 0):
    parser.error("Need --count or --rolls.")
  if not 0 <= args.difficulty <= 1:
    parser.error("--difficulty needs to be between 0 and 1, inclusive.")
  main(args)
	#!/usr/bin/python3
	"""Better diceware lists for German

	Diceware (http://world.std.com/~reinhold/diceware.html) is a great way to
	generate random, yet memorizable passphrases.

	This Python script is designed to read a "Grundformliste" (list of base forms)
	of German words in the format published by the Institut für deutsche Sprache
	(Institute for German Language). It parses the list, selects the best words, and
	outputs a word list compatible with diceware.

	The "best" words are defined as words that are at the same time short and
	common, so that they are easy to remember but also easy to type. You can
	trade off between shortness and commonality using the `--difficulty` parameter.

	To make a list, you can download a published Grundformliste from the Institute's
	web site (http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html) and
	unzip it. By default, the script expects an input file in the working directory
	named derewo-v-ww-bll-320000g-2012-12-31-1.0.txt and outputs to wordlist_de.txt.
	"""

	import argparse
	import collections
	import io
	import math
	import re
	import sys


	Word = collections.namedtuple("Word", "word, freq_class, score")


	class Formatter(argparse.ArgumentDefaultsHelpFormatter,
	                argparse.RawDescriptionHelpFormatter):
	  """Help formatter combining two stock argparse formatters.

	  Argument defaults are appended to each argument's help text, and the parser
	  description is printed as written instead of being re-wrapped.
	  """


	def parse_words(file, weight, to_ascii):
	  """Reads text line-by-line and returns a list of Word objects.

	  Args:
	    file: Iterable of text lines, e.g. an open text file.
	    weight: Trade-off factor for the score, which is computed as
	      `weight * len(word) + (1 - weight) * freq_class`.
	    to_ascii: If true, ä, ö, ü, ß are replaced by ae, oe, ue, ss before the
	      line is matched.

	  Returns:
	    A list with one Word per distinct (lower-cased) spelling. If a spelling
	    occurs more than once, the entry with the lowest frequency class is kept.
	    Lines that do not start with a word followed by a space and a number are
	    skipped.
	  """

	  # Regex that matches word and frequency class. Note: we only allow German
	  # letters (Latin letters, umlauts, and ß) plus hyphens to make sure any
	  # potential user will know how to type the word on a German keyboard.
	  line_regex = re.compile(r"([a-zäöüß\-]+) (\d+)")

	  umlaut_table = str.maketrans({
	      "ä": "ae",
	      "ö": "oe",
	      "ü": "ue",
	      "ß": "ss",
	  })
	  # Matches a parenthesized, comma-separated list of endings and captures the
	  # first one.
	  alternatives_regex = re.compile(r"\(([a-zäöüß]+)[a-zäöüß,]*?\)")

	  words = {}
	  for line in file:
	    # First, bring everything to lower case. We don't want to distinguish
	    # between upper and lower case, because it can cause ambiguities when
	    # remembering the word.
	    line = line.lower()
	    # If ASCII conversion is requested, map according to umlaut_table.
	    if to_ascii:
	      line = line.translate(umlaut_table)
	    # Then, if the line contains several alternatives in parentheses, simply
	    # pick the first one. For example: "ein(e) -> eine", "welch(er,e,es) ->
	    # welcher".
	    line = alternatives_regex.sub(r"\1", line)
	    # Match at the beginning of the line.
	    match = line_regex.match(line)
	    if not match:
	      continue
	    word = match.group(1)
	    freq_class = int(match.group(2))
	    # If the spelling already exists, only update it if the new word is more
	    # common.
	    if word not in words or words[word].freq_class > freq_class:
	      words[word] = Word(
	          word,
	          freq_class,
	          weight * len(word) + (1 - weight) * freq_class,
	      )
	  # Return a real list, as documented, rather than a live view of the dict.
	  return list(words.values())


	def dice_rolls(index, rolls):
	  """Returns the dice sequence for a zero-based word index.

	  The index is written in base 6 using exactly `rolls` digits, most
	  significant digit first, with every digit shifted by one so that it reads
	  as a die face (1-6). Only the lowest `rolls` base-6 digits of `index` are
	  used.
	  """
	  string = [None] * rolls
	  # Fill from the last (least significant) position towards the first.
	  for roll in range(rolls - 1, -1, -1):
	    string[roll] = str(index % 6 + 1)
	    index //= 6
	  return "".join(string)


	def main(args):
	  """Builds the word list described by `args` and writes it to the output file.

	  Args:
	    args: Parsed command-line arguments providing `input`, `output`,
	      `encoding`, `difficulty`, `ascii`, `rolls` and `count`.
	  """
	  # A positive --count requests a plain list and overrides --rolls.
	  if args.count > 0:
	    count = args.count
	  else:
	    count = 6 ** args.rolls

	  with io.open(args.input, "r", encoding=args.encoding) as file:
	    words = parse_words(file, args.difficulty, args.ascii)

	  # Keep the `count` lowest-scoring words, then order them alphabetically.
	  words = sorted(words, key=lambda w: w.score)
	  best = sorted(words[:count], key=lambda w: w.word)

	  # The input may yield fewer usable words than requested, so report and
	  # average over what was actually selected rather than over `count`.
	  selected = len(best)
	  if selected < count:
	    print(
	        "Warning: only {} usable words found, but {} were requested."
	        "".format(selected, count),
	        file=sys.stderr)
	  divisor = max(selected, 1)  # Avoid dividing by zero for an empty selection.

	  print(
	      "Selected the best {} out of {} words. "
	      "Average length: {:0.3}, average frequency class: {:0.3}."
	      "".format(
	          selected, len(words),
	          sum(len(w.word) for w in best) / divisor,
	          sum(w.freq_class for w in best) / divisor),
	      file=sys.stderr)

	  with io.open(args.output, "w", encoding="utf-8") as file:
	    if args.count > 0:
	      for word in best:
	        print(word.word, file=file)
	    else:
	      # Diceware format: dice sequence, tab, word.
	      for i, word in enumerate(best):
	        print("{}\t{}".format(dice_rolls(i, args.rolls), word.word), file=file)


	if __name__ == "__main__":
	  # Command-line entry point: parse the options, validate them, run main().
	  parser = argparse.ArgumentParser(
	      formatter_class=Formatter,
	      description=__doc__)

	  parser.add_argument(
	      "--input", "-i", type=str,
	      default="derewo-v-ww-bll-320000g-2012-12-31-1.0.txt",
	      help="Input filename.")
	  parser.add_argument(
	      "--output", "-o", type=str,
	      default="wordlist_de.txt",
	      help="Output filename.")
	  parser.add_argument(
	      "--rolls", "-r", type=int, default=5,
	      help="Output diceware list for this number of dice rolls.")
	  parser.add_argument(
	      "--count", "-c", type=int, default=0,
	      help="Output plain word list with this number of words. If > 0, "
	           "overrides --rolls.")
	  parser.add_argument(
	      "--difficulty", "-d", type=float, default=.4,
	      help="Difficulty of the vocabulary in the range of 0 (uses the most "
	           "common words) to 1 (uses the shortest words).")
	  parser.add_argument(
	      "--ascii", "-a", action="store_true",
	      help="Convert ä, ö, ü, ß to ae, oe, ue, ss.")
	  parser.add_argument(
	      "--encoding", "-e", type=str, default="iso-8859-15",
	      help="Input file encoding. Output file will have utf-8 encoding.")

	  args = parser.parse_args()
	  # Report invalid option values through argparse so the user gets a usage
	  # message and exit status 2 instead of a traceback.
	  if not (args.count > 0 or args.rolls > 0):
	    parser.error("Need --count or --rolls.")
	  if not 0 <= args.difficulty <= 1:
	    parser.error("--difficulty needs to be between 0 and 1, inclusive.")
	  main(args)