Last active
December 29, 2017 08:27
-
-
Save svengerlach/79fa9c880f0548e7ddd0296fc12f8230 to your computer and use it in GitHub Desktop.
Prepare Troy Hunt's >300 million password list for a binary search operation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Prepare Troy Hunt's "Pwned Passwords" list for a binary search operation.
#
# See https://haveibeenpwned.com/Passwords for more information.
#
# Usage: place this file in your home directory, e.g. as
# prepare-pwned-passwords.sh, and execute it using bash:
#
#   nohup bash prepare-pwned-passwords.sh 1>prepare-pwned-passwords.log 2>prepare-pwned-passwords.err &
#
# The execution takes quite some time, which is why it is run under nohup.
# If the execution went well, you will find pwned-passwords-sorted.txt
# in the directory ~/pwned-passwords.
#
# The original author ran this script on a DigitalOcean Ubuntu 16.04 droplet
# located in Singapore (SGP1) with 4 GB of RAM and 60 GB of SSD disk space;
# that run took about 25 minutes.
#
# Be sure to have at least 30 GB of disk space available!
# The extracted and combined txt files use about 13 GB of disk space.
# During the run the data exists more than once (extracted sources, the
# combined file, the sorted output), and sort writes temporary files too.
#
# The script is resumable: it skips the download when pwned-passwords.txt
# already exists and skips everything when pwned-passwords-sorted.txt already
# exists. Both files are therefore first written under a ".partial" name and
# renamed only once they are complete, so an interrupted run can never leave
# a truncated file behind that a later run would mistake for a finished one.
#
# Possible further optimizations:
#   - the various options of p7zip have not been explored
#   - the various options of sort have not been explored
#     (--buffer-size may be a good starting point)
#
# Big THANKS to Troy Hunt and his efforts; these password lists really help
# to secure our users' data.

# Stop immediately if any command fails, if an unset variable is used, or if
# any stage of a pipeline fails (needed for the cat | tr pipeline below).
set -o errexit
set -o nounset
set -o pipefail

# Working directory; its tmp/ subdirectory holds sort's temporary files.
readonly WORK_DIR=~/pwned-passwords

# Download location and the archives to fetch. The first archive is the
# biggest one (>300 million lines, ~5.3 GB); the other two are updates.
readonly BASE_URL='https://downloads.pwnedpasswords.com/passwords'
readonly -a ARCHIVES=(
  pwned-passwords-1.0.txt.7z
  pwned-passwords-update-1.txt.7z
  pwned-passwords-update-2.txt.7z
)

# Just some debugging output: what time is it? ;-)
date

# The password lists are provided as 7zip archives, so the 7zr command
# (shipped in the p7zip package) is needed to extract them. Probe for the
# command itself rather than grepping the package list.
if ! command -v 7zr >/dev/null 2>&1; then
  sudo apt-get update
  sudo apt-get install -y p7zip
fi

mkdir -p "${WORK_DIR}/tmp"
cd "${WORK_DIR}"

# Nothing to do if the sorted file has already been produced.
if [[ ! -f pwned-passwords-sorted.txt ]]; then

  # If pwned-passwords.txt is already present there is no need to download
  # all the archives again (~5.6 GB in total).
  if [[ ! -f pwned-passwords.txt ]]; then
    text_files=()
    for archive in "${ARCHIVES[@]}"; do
      # Drop any stale (possibly incomplete) copy from an earlier run.
      rm -f -- "${archive}"
      wget "${BASE_URL}/${archive}"
      # -y: overwrite leftovers of an earlier run without prompting; a
      # prompt could not be answered when running under nohup.
      7zr e -y "${archive}"
      # Only remove the archive we downloaded ourselves, never other *.7z
      # files that happen to live in this directory.
      rm -- "${archive}"
      # We assume each archive contains a single file named like the
      # archive without its .7z suffix, e.g. pwned-passwords-1.0.txt.
      text_files+=("${archive%.7z}")
    done

    # Combine the three lists and convert the Windows line endings in one
    # streaming pass: octal 15 is CR, octal 32 is Ctrl-Z (the DOS
    # end-of-file marker). Streaming avoids an extra full-size CRLF copy.
    cat -- "${text_files[@]}" \
      | tr -d '\15\32' > pwned-passwords.txt.partial
    mv -- pwned-passwords.txt.partial pwned-passwords.txt

    # The extracted source files are no longer needed.
    rm -- "${text_files[@]}"
  fi

  # A binary search only works if the lines are sorted.
  #   LC_ALL=C              byte-wise, locale-independent ordering, so the
  #                         result matches a plain byte comparison
  #   --temporary-directory where sort puts its temporary files
  #   --parallel=2          sort concurrently (only useful on a system with
  #                         at least 2 CPUs)
  #   --output              written under a .partial name, renamed when done
  LC_ALL=C sort \
    --temporary-directory="${WORK_DIR}/tmp" \
    --parallel=2 \
    --output=pwned-passwords-sorted.txt.partial \
    pwned-passwords.txt
  mv -- pwned-passwords-sorted.txt.partial pwned-passwords-sorted.txt

  # The unsorted source file is no longer needed.
  rm -- pwned-passwords.txt
fi

# And again, just some debugging output.
date
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment