Asmageddon/passwordstrength.py

## passwordstrength.py
import random, math
import re

def log_bin(value):
	return math.log(value) / math.log(2)

def cumprod(values):
	result = 1
	for i in values:
		result *= i
	return result

def gen_password(charset, length):
	result = ""
	for i in range(length):
		to_add = random.choice(charset)
		if len(to_add) > 1: #If it's not a single char, capitalize it so the attacker cannot make assumption that it's all lowercase
			to_add = to_add.capitalize()
		result += to_add
	return result

def format_size(size):
	size *= 1.0 #Convert to floats
	unit = ""
	prefix_list = "KMGTPEZY" #Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta
	prefix = -1

	while size > 1024:
		size /= 1024
		prefix+= 1

	if prefix >= len(prefix_list):
		return "much"
	elif prefix == 0:
		return "%.2fB" % size
	else:
		return "%.2f%siB" % (size, prefix_list[prefix])

def num_separators(number):
	number = list( str( int(number) ) )
	result = ""
	separator = ','
	for index, char in enumerate(reversed(number)):
		result = char + result
		if not (index +1) % 3 and index != len(number):
			result = separator + result
	return result

def acronym(text): #From big letters found in it
	pattern = re.compile("[A-Z]") #Simply find big letters ^^
	return ''.join( re.findall(pattern, text) )

dictfile = open("/usr/share/dict/words", "r")

pattern = re.compile("^.*[^'][^s]$") #Remove words ending with 's

#Generate list of words:
word_list = [word[:-1] for word in dictfile.readlines() if re.match(pattern, word[:-1])] # [:-1] removes the last character of the line - \n

#Count them and their average length
word_count = len(word_list)
average_word_length = sum([len(word) for word in word_list]) / (1.0 * len(word_list))
word_entropy_per_char = log_bin(word_count) / average_word_length

hash_digest_size = 512 / 8 #For SHA512, in bytes

#We're only using ASCII letters and numbers, no symbols except for underscore - a common set of allowed characters for passwords
char_list = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789"
char_count = len(char_list)
char_entropy_per_char = log_bin(char_count)

print "%i words in our dictionary" % word_count
print "  Of average length %.2f" % average_word_length
print "  With entropy content of %.2f bits per character\n" % word_entropy_per_char

print "We're using %i characters" % char_count
print "  With entropy content of %.2f bits each" % char_entropy_per_char
print "  Containing %.2f%% the entropy content of single character in a random word\n" % (char_entropy_per_char / word_entropy_per_char * 100)

results = [] #In format (combinations, text, disk usage of all hashes)

#Let's do the math for characters first, from 4 to 22
for i in range(4, 22):
	combinations = char_count**i
	#Disk usage of hashes for all possibilities
	disk_usage = combinations * (hash_digest_size + i + 1) #+1 for a delimiter, presumably newline
	results += [(combinations,
				"%2i chars:      %.2e permutations, %s disk space" % (i, combinations, format_size(disk_usage) ),
				disk_usage
	)]

#Now for words, we assume that attacker has a complete dictionary available
for i in range(1,8):
	combinations = word_count**i
	disk_usage = combinations * (hash_digest_size + i*average_word_length + 1)
	results += [(combinations,
				"%i words(%2ich): %.2e permutations, %s disk space" % (i, math.ceil(i*average_word_length), combinations, format_size(disk_usage) ),
				disk_usage
	)]

#Disclaimer: I have no idea how much this would take on what hardware, so please adjust yourself
hashing_speed = 9001.0 * 1000**2 #9001 million hashes per second

print "With %s hashes per second: " % num_separators(hashing_speed)

marks = [60, 60, 24, 30, 12, 100, 137000000] #Minutes, hours, days, months, years, centuries, ages of universe

marks_text = ["seconds", "minutes", "hours", "days", "months", "years", "centuries", "ages of universe"]

mark = 0

print "\nExample passwords:"
print "  12 random characters:"
for i in range(0,5):
	print "    %s  %s" % (gen_password(char_list, 12), gen_password(char_list, 12))
print "  4 random words:"
for i in range(0,5):
	word = gen_password(word_list, 4)
	print "    %s: %s (%ich)" % (acronym(word), word, len(word))
print "  5 random words:"
for i in range(0,5):
	word = gen_password(word_list, 5)
	print "    %s: %s (%ich)" % (acronym(word), word, len(word))
print "  15 random characters:"
for i in range(0,5):
	print "    %s  %s" % (gen_password(char_list, 15), gen_password(char_list, 15))

print "==Starting from seconds=="

for i in sorted(results): #Sorted by amount of permutations
	#Change our unit when we go over it's limit
	while i[0] / hashing_speed > cumprod( marks[:mark+1] ) and mark <= len(marks)-1: #More than one mark may be passed in one step
		mark += 1
		print "==Going into %s==" % marks_text[mark]
	print "%s (%.2f %s to brute force)" % ( i[1], i[0] / hashing_speed / cumprod(marks[:mark]), marks_text[mark] )

print "Disk usage is amount of space to store all hashes"
print "Brute force assumes worst case, averages to half of the value"

print "\nNotes:"
print "  With disk usage, I am assuming no compression is used on words."
print "   Zipping my words file yielded decrease from 931 to 252 kilobytes"
print "   Compressing would probably decrease size by ~2/3 of that,"
print "   since hashes are pretty much incompressible"
print "  Compressing random characters makes no significant difference"
print "  I am not accounting for technological progress, but I assume"
print "   that by time it becomes easy enough to crack these,"
print "   most currently used algorithms will be irrelevant anyway"

print "\nVerdict:"
print "  While random characters contain way more entropy per character,"
print "   a set of words is much easier to remember"
num_increase = average_word_length * 4 * 10
print "  Inserting randomly a single number into 4 words increases"
print "   amount of permutations %i times (%i orders of magnitude)" % (num_increase, math.floor (math.log10(num_increase)) )
print "   %.2f%% of how much adding a single char to a random sequence does" % (num_increase * 100.0 / char_count)
print "As such, I declare that using a sequence of 5 or even 4 random words"
print " is a superior alternative to random characters,"
print " since it is much easier for human brain to remember sequences"
print " that have a meaning, and acronyms help further"
	import random, math
	import re

	def log_bin(value):
	return math.log(value) / math.log(2)

	def cumprod(values):
	result = 1
	for i in values:
	result *= i
	return result

	def gen_password(charset, length):
	result = ""
	for i in range(length):
	to_add = random.choice(charset)
	if len(to_add) > 1: #If it's not a single char, capitalize it so the attacker cannot make assumption that it's all lowercase
	to_add = to_add.capitalize()
	result += to_add
	return result

	def format_size(size):
	size *= 1.0 #Convert to floats
	unit = ""
	prefix_list = "KMGTPEZY" #Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta
	prefix = -1

	while size > 1024:
	size /= 1024
	prefix+= 1

	if prefix >= len(prefix_list):
	return "much"
	elif prefix == 0:
	return "%.2fB" % size
	else:
	return "%.2f%siB" % (size, prefix_list[prefix])

	def num_separators(number):
	number = list( str( int(number) ) )
	result = ""
	separator = ','
	for index, char in enumerate(reversed(number)):
	result = char + result
	if not (index +1) % 3 and index != len(number):
	result = separator + result
	return result

	def acronym(text): #From big letters found in it
	pattern = re.compile("[A-Z]") #Simply find big letters ^^
	return ''.join( re.findall(pattern, text) )

	dictfile = open("/usr/share/dict/words", "r")

	pattern = re.compile("^.*[^'][^s]$") #Remove words ending with 's

	#Generate list of words:
	word_list = [word[:-1] for word in dictfile.readlines() if re.match(pattern, word[:-1])] # [:-1] removes the last character of the line - \n

	#Count them and their average length
	word_count = len(word_list)
	average_word_length = sum([len(word) for word in word_list]) / (1.0 * len(word_list))
	word_entropy_per_char = log_bin(word_count) / average_word_length

	hash_digest_size = 512 / 8 #For SHA512, in bytes

	#We're only using ASCII letters and numbers, no symbols except for underscore - a common set of allowed characters for passwords
	char_list = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789"
	char_count = len(char_list)
	char_entropy_per_char = log_bin(char_count)

	print "%i words in our dictionary" % word_count
	print " Of average length %.2f" % average_word_length
	print " With entropy content of %.2f bits per character\n" % word_entropy_per_char

	print "We're using %i characters" % char_count
	print " With entropy content of %.2f bits each" % char_entropy_per_char
	print " Containing %.2f%% the entropy content of single character in a random word\n" % (char_entropy_per_char / word_entropy_per_char * 100)

	results = [] #In format (combinations, text, disk usage of all hashes)

	#Let's do the math for characters first, from 4 to 22
	for i in range(4, 22):
	combinations = char_count**i
	#Disk usage of hashes for all possibilities
	disk_usage = combinations * (hash_digest_size + i + 1) #+1 for a delimiter, presumably newline
	results += [(combinations,
	"%2i chars: %.2e permutations, %s disk space" % (i, combinations, format_size(disk_usage) ),
	disk_usage
	)]

	#Now for words, we assume that attacker has a complete dictionary available
	for i in range(1,8):
	combinations = word_count**i
	disk_usage = combinations * (hash_digest_size + i*average_word_length + 1)
	results += [(combinations,
	"%i words(%2ich): %.2e permutations, %s disk space" % (i, math.ceil(i*average_word_length), combinations, format_size(disk_usage) ),
	disk_usage
	)]

	#Disclaimer: I have no idea how much this would take on what hardware, so please adjust yourself
	hashing_speed = 9001.0 * 1000**2 #9001 million hashes per second

	print "With %s hashes per second: " % num_separators(hashing_speed)

	marks = [60, 60, 24, 30, 12, 100, 137000000] #Minutes, hours, days, months, years, centuries, ages of universe

	marks_text = ["seconds", "minutes", "hours", "days", "months", "years", "centuries", "ages of universe"]

	mark = 0

	print "\nExample passwords:"
	print " 12 random characters:"
	for i in range(0,5):
	print " %s %s" % (gen_password(char_list, 12), gen_password(char_list, 12))
	print " 4 random words:"
	for i in range(0,5):
	word = gen_password(word_list, 4)
	print " %s: %s (%ich)" % (acronym(word), word, len(word))
	print " 5 random words:"
	for i in range(0,5):
	word = gen_password(word_list, 5)
	print " %s: %s (%ich)" % (acronym(word), word, len(word))
	print " 15 random characters:"
	for i in range(0,5):
	print " %s %s" % (gen_password(char_list, 15), gen_password(char_list, 15))

	print "==Starting from seconds=="

	for i in sorted(results): #Sorted by amount of permutations
	#Change our unit when we go over it's limit
	while i[0] / hashing_speed > cumprod( marks[:mark+1] ) and mark <= len(marks)-1: #More than one mark may be passed in one step
	mark += 1
	print "==Going into %s==" % marks_text[mark]
	print "%s (%.2f %s to brute force)" % ( i[1], i[0] / hashing_speed / cumprod(marks[:mark]), marks_text[mark] )

	print "Disk usage is amount of space to store all hashes"
	print "Brute force assumes worst case, averages to half of the value"

	print "\nNotes:"
	print " With disk usage, I am assuming no compression is used on words."
	print " Zipping my words file yielded decrease from 931 to 252 kilobytes"
	print " Compressing would probably decrease size by ~2/3 of that,"
	print " since hashes are pretty much incompressible"
	print " Compressing random characters makes no significant difference"
	print " I am not accounting for technological progress, but I assume"
	print " that by time it becomes easy enough to crack these,"
	print " most currently used algorithms will be irrelevant anyway"

	print "\nVerdict:"
	print " While random characters contain way more entropy per character,"
	print " a set of words is much easier to remember"
	num_increase = average_word_length * 4 * 10
	print " Inserting randomly a single number into 4 words increases"
	print " amount of permutations %i times (%i orders of magnitude)" % (num_increase, math.floor (math.log10(num_increase)) )
	print " %.2f%% of how much adding a single char to a random sequence does" % (num_increase * 100.0 / char_count)
	print "As such, I declare that using a sequence of 5 or even 4 random words"
	print " is a superior alternative to random characters,"
	print " since it is much easier for human brain to remember sequences"
	print " that have a meaning, and acronyms help further"