svengerlach/prepare-pwned-passwords.sh

## prepare-pwned-passwords.sh
#!/bin/bash

# this script helps to prepare Troy Hunt's password list for a binary search operation
#
# see https://haveibeenpwned.com/Passwords for more information
#
# place this file into your home directory, e.g. called prepare-pwned-passwords.sh and
# execute it using bash:
#
# nohup bash prepare-pwned-passwords.sh 1>prepare-pwned-passwords.log 2>prepare-pwned-passwords.err &
#
# the execution will take quite some time, that's why we run it using nohup.
# if the execution went well, you should find a file pwned-passwords-sorted.txt
# in directory ~/pwned-passwords
#
# I executed this script on a DigitalOcean Ubuntu 16.04 droplet located in Singapore (SGP1)
# The server had 4 GB of RAM, 60 GB of SSD disk space
# The execution took about 25 minutes
#
# Be sure to have at least 30 GB of disk space available!
# The extracted and combined txt files use about 13 GB of disk space.
# Throughout the execution, these files will be duplicated. Additionally, the sort command will produce some temporary files.
#
# The execution might be optimized a little further:
# - I didn't take a look at the various options of p7zip (extracting files may be optimized)
# - I didn't take a look at the various options of sort (sorting the lines may be optimized,
#   specifying --buffer-size may be a good starting point)
#
# Big THANKS to Troy Hunt and his efforts; I really think these password lists help to secure our users' data

# Stop immediately when a command fails, when an unset variable is used, or
# when any stage of a pipeline fails, so a broken step never feeds the next one.
set -o errexit -o nounset -o pipefail

# just some debugging output: log the start time
date

# Troy Hunt provides the pwned passwords as 7zip archives, so the 7zr
# extractor (shipped by the p7zip package) has to be available.
# We probe for the binary itself instead of grepping `dpkg -l`: dpkg also
# lists removed-but-not-purged packages (state "rc"), which would wrongly
# skip the installation and let the extraction step fail later on.
if ! command -v 7zr >/dev/null 2>&1; then
    sudo apt-get update
    sudo apt-get install -y p7zip
fi

# create the working directory ~/pwned-passwords together with its tmp/
# subdirectory (handed to sort as its temporary directory further down)
mkdir -p ~/pwned-passwords/tmp

# every following step works relative to the working directory
cd ~/pwned-passwords

# nothing to do if a previous run already produced pwned-passwords-sorted.txt
if [[ ! -f pwned-passwords-sorted.txt ]]; then
    # pwned-passwords.txt is the combined, LF-terminated, still unsorted list;
    # if it survived an earlier run we don't need to re-download ~5.6 GB
    if [[ ! -f pwned-passwords.txt ]]; then
        # the base list (>300 million lines, ~5.3 GB) followed by its two updates
        for archive in \
            pwned-passwords-1.0.txt.7z \
            pwned-passwords-update-1.txt.7z \
            pwned-passwords-update-2.txt.7z
        do
            # drop an archive left behind by an earlier run before fetching
            # it again, so only the fresh download is extracted below
            rm -f -- "${archive}"
            wget "https://downloads.pwnedpasswords.com/passwords/${archive}"
        done

        # extract all 7z archives located in the current directory;
        # -y answers "yes" to every query so that text files left over from an
        # aborted run cannot stall an unattended (nohup) execution at the
        # overwrite prompt
        for f in ./*.7z; do
            7zr e -y "${f}"
        done

        # remove all 7z archives located in the current directory
        rm -- ./*.7z

        # we assume we now have three different files:
        # - pwned-passwords-1.0.txt
        # - pwned-passwords-update-1.txt
        # - pwned-passwords-update-2.txt
        # a binary search needs one single file, so append both updates
        # to the base list and drop them afterwards
        cat pwned-passwords-update-1.txt >> pwned-passwords-1.0.txt
        cat pwned-passwords-update-2.txt >> pwned-passwords-1.0.txt
        rm pwned-passwords-update-1.txt
        rm pwned-passwords-update-2.txt

        # the combined file contains windows line endings: delete every
        # carriage return (octal 15) and every SUB / Ctrl-Z character
        # (octal 32) so that only plain newlines remain.
        # The result is written to a ".part" file and renamed once complete:
        # the existence of pwned-passwords.txt tells a later run that this
        # step is done, so a truncated file must never carry that name.
        tr -d '\15\32' < pwned-passwords-1.0.txt > pwned-passwords.txt.part
        mv pwned-passwords.txt.part pwned-passwords.txt

        # the CRLF variant is not needed anymore
        rm pwned-passwords-1.0.txt
    fi

    # a binary search can only work in an efficient manner if the lines are
    # sorted, therefore we need to run the sort command:
    # - LC_ALL=C forces plain byte order, which is what a byte-wise binary
    #   search expects, and avoids the slower locale-aware comparisons
    # - sort keeps its temporary files in tmp/ below the working directory
    # - --parallel=2 only helps on a machine with at least 2 CPUs
    # - the output goes to a ".part" file and is renamed when sort succeeded,
    #   so an interrupted sort is redone instead of being mistaken for a
    #   finished pwned-passwords-sorted.txt on the next run
    LC_ALL=C sort \
        --temporary-directory="$(pwd)/tmp/" \
        --parallel=2 \
        --output=pwned-passwords-sorted.txt.part \
        pwned-passwords.txt
    mv pwned-passwords-sorted.txt.part pwned-passwords-sorted.txt

    # we don't need our source file pwned-passwords.txt anymore, remove it!
    rm pwned-passwords.txt
fi

# and again, just some debugging output: log the end time
date
	#!/bin/bash

	# this script helps to prepare Troy Hunt's password list for a binary search operation
	#
	# see https://haveibeenpwned.com/Passwords for more information
	#
	# place this file into your home directory, e.g. called prepare-pwned-passwords.sh and
	# execute it using bash:
	#
	# nohup bash prepare-pwned-passwords.sh 1>prepare-pwned-passwords.log 2>prepare-pwned-passwords.err &
	#
	# the execution will take quite some time, that's why we run it using nohup.
	# if the execution went well, you should find a file pwned-passwords-sorted.txt
	# in directory ~/pwned-passwords
	#
	# I executed this script on a DigitalOcean Ubuntu 16.04 droplet located in Singapore (SGP1)
	# The server had 4 GB of RAM, 60 GB of SSD disk space
	# The execution took about 25 minutes
	#
	# Be sure to have at least 30 GB of disk space available!
	# The extracted and combined txt files use about 13 GB of disk space.
	# Throughout the execution, these files will be duplicated. Additionally, the sort command will produce some temporary files.
	#
	# The execution might be optimized a little further:
	# - I didn't take a look at the various options of p7zip (extracting files may be optimized)
	# - I didn't take a look at the various options of sort (sorting the lines may be optimized,
	# specifying --buffer-size may be a good starting point)
	#
	# Big THANKS to Troy Hunt and his efforts; I really think these password lists help to secure our users' data

	# Stop immediately when a command fails, when an unset variable is used, or
	# when any stage of a pipeline fails, so a broken step never feeds the next one.
	set -o errexit -o nounset -o pipefail

	# just some debugging output: log the start time
	date

	# Troy Hunt provides the pwned passwords as 7zip archives, so the 7zr
	# extractor (shipped by the p7zip package) has to be available.
	# We probe for the binary itself instead of grepping `dpkg -l`: dpkg also
	# lists removed-but-not-purged packages (state "rc"), which would wrongly
	# skip the installation and let the extraction step fail later on.
	# (This also replaces the markdown-escaped "\|" pipes, which are not valid
	# shell syntax.)
	if ! command -v 7zr >/dev/null 2>&1; then
	    sudo apt-get update
	    sudo apt-get install -y p7zip
	fi

	# create the working directory ~/pwned-passwords together with its tmp/
	# subdirectory (handed to sort as its temporary directory further down)
	mkdir -p ~/pwned-passwords/tmp

	# every following step works relative to the working directory
	cd ~/pwned-passwords

	# nothing to do if a previous run already produced pwned-passwords-sorted.txt
	if [[ ! -f pwned-passwords-sorted.txt ]]; then
	    # pwned-passwords.txt is the combined, LF-terminated, still unsorted list;
	    # if it survived an earlier run we don't need to re-download ~5.6 GB
	    if [[ ! -f pwned-passwords.txt ]]; then
	        # the base list (>300 million lines, ~5.3 GB) followed by its two updates
	        for archive in \
	            pwned-passwords-1.0.txt.7z \
	            pwned-passwords-update-1.txt.7z \
	            pwned-passwords-update-2.txt.7z
	        do
	            # drop an archive left behind by an earlier run before fetching
	            # it again, so only the fresh download is extracted below
	            rm -f -- "${archive}"
	            wget "https://downloads.pwnedpasswords.com/passwords/${archive}"
	        done

	        # extract all 7z archives located in the current directory;
	        # -y answers "yes" to every query so that text files left over from an
	        # aborted run cannot stall an unattended (nohup) execution at the
	        # overwrite prompt
	        for f in ./*.7z; do
	            7zr e -y "${f}"
	        done

	        # remove all 7z archives located in the current directory
	        rm -- ./*.7z

	        # we assume we now have three different files:
	        # - pwned-passwords-1.0.txt
	        # - pwned-passwords-update-1.txt
	        # - pwned-passwords-update-2.txt
	        # a binary search needs one single file, so append both updates
	        # to the base list and drop them afterwards
	        cat pwned-passwords-update-1.txt >> pwned-passwords-1.0.txt
	        cat pwned-passwords-update-2.txt >> pwned-passwords-1.0.txt
	        rm pwned-passwords-update-1.txt
	        rm pwned-passwords-update-2.txt

	        # the combined file contains windows line endings: delete every
	        # carriage return (octal 15) and every SUB / Ctrl-Z character
	        # (octal 32) so that only plain newlines remain.
	        # The result is written to a ".part" file and renamed once complete:
	        # the existence of pwned-passwords.txt tells a later run that this
	        # step is done, so a truncated file must never carry that name.
	        tr -d '\15\32' < pwned-passwords-1.0.txt > pwned-passwords.txt.part
	        mv pwned-passwords.txt.part pwned-passwords.txt

	        # the CRLF variant is not needed anymore
	        rm pwned-passwords-1.0.txt
	    fi

	    # a binary search can only work in an efficient manner if the lines are
	    # sorted, therefore we need to run the sort command:
	    # - LC_ALL=C forces plain byte order, which is what a byte-wise binary
	    #   search expects, and avoids the slower locale-aware comparisons
	    # - sort keeps its temporary files in tmp/ below the working directory
	    # - --parallel=2 only helps on a machine with at least 2 CPUs
	    # - the output goes to a ".part" file and is renamed when sort succeeded,
	    #   so an interrupted sort is redone instead of being mistaken for a
	    #   finished pwned-passwords-sorted.txt on the next run
	    LC_ALL=C sort \
	        --temporary-directory="$(pwd)/tmp/" \
	        --parallel=2 \
	        --output=pwned-passwords-sorted.txt.part \
	        pwned-passwords.txt
	    mv pwned-passwords-sorted.txt.part pwned-passwords-sorted.txt

	    # we don't need our source file pwned-passwords.txt anymore, remove it!
	    rm pwned-passwords.txt
	fi

	# and again, just some debugging output: log the end time
	date