n8henrie/search_bad_logins.py

## search_bad_logins.py
#!/usr/bin/env python3
"""
search_bad_logins.py :: Compares a LastPass export to your Bitwarden vault.

Python >=3.10, no third party Python libraries / dependencies.

Outputs BW logins that may have been compromised in the recent LastPass hack
based on matching domain and password.

It would probably make sense to cast an even wider net by using something like
[`xsv`](https://github.com/BurntSushi/xsv) to just search for a potentially
compromised password *anywhere* in your vault, but this should help point out
some "this definitely needs to be changed" logins.

Prior to running, you'll need to:
    1. Install the Bitwarden CLI
    2. Log in with `bw login`
    3. Export the session variable:
        - Copy and paste the line starting with `$ export BW_SESSION=`
        - You'll need to remove the leading `$`
        - I highly recommend you insert a leading space where the `$` was,
          which should keep this command out of your bash history
    4. Run this script from the same terminal you used for steps 1-3; run
       `bw list items` to make sure your login is working first

Usage:
    ./search_bad_logins.py --yes-i-read-the-script /path/to/lastpass-export.csv
"""

import argparse
import csv
import io
import json
import os
import subprocess
import sys
import urllib.request
from collections import defaultdict
from functools import lru_cache
from urllib.parse import urlparse


def _cli(show_usage: bool = False) -> argparse.Namespace:
    """Build the argument parser and parse the command line.

    Args:
        show_usage: If true, do not parse ``sys.argv``; print the full help
            text and exit with status 1. ``main`` takes this path when it
            refuses to run, so the exit status must signal failure
            (``parse_args(["--help"])`` would exit with status 0).

    Returns:
        The parsed arguments. ``lastpass_csv`` has already been opened for
        reading by ``argparse.FileType``.

    Raises:
        SystemExit: On ``--help``, on invalid arguments, or with status 1
            when ``show_usage`` is true.
    """
    parser = argparse.ArgumentParser()
    # Second FileType argument is the buffer size; 1 selects line buffering.
    parser.add_argument("lastpass_csv", type=argparse.FileType("r", 1))
    parser.add_argument(
        "--yes-i-read-the-script",
        action="store_true",
        help="""
            This tool is reading both your lastpass and bitwarden vault data.
            You *really* should read the source code before you run it. Will
            not run without this flag set.
            """,
    )
    parser.add_argument(
        "--strip-subdomains",
        action="store_true",
        help="""
            Try to match based on TLD alone. This option will require
            downloading the list of TLDs from its GitHub mirror. Doesn't cache
            the response (yet).
            """,
    )
    parser.add_argument(
        "--passwords-anywhere",
        action="store_true",
        help="""
            Report any Bitwarden item that has a matching password anywhere in
            Lastpass. Skips matching on URI or username, so doesn't output
            usernames, and the shown URI is for the relevant Bitwarden item.
            """,
    )

    # Only one way of choosing the output format may be given at a time.
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "--output-format",
        default="csv",
        choices=["csv", "json"],
        help="Output format",
    )
    group.add_argument(
        "--json",
        action="store_true",
        help="Shortcut for `--output-format=json`",
    )
    group.add_argument(
        "--csv",
        action="store_true",
        help="Shortcut for `--output-format=csv`",
    )

    if show_usage:
        parser.print_help()
        parser.exit(status=1)

    return parser.parse_args()


@lru_cache
def get_tlds() -> set[str]:
    """Return the set of entries in the Public Suffix List.

    Uses the GitHub mirror to save publicsuffix.org some bandwidth. The
    result is memoized for the lifetime of the process (``lru_cache``) but is
    not stored on disk, so the list is downloaded once per run; please be
    kind.

    Returns:
        Every non-blank, non-comment line of the list, stripped of
        surrounding whitespace. Wildcard (``*.``) and exception (``!``)
        rules are returned verbatim.

    Raises:
        RuntimeError: If the server answers with a status other than 200.
        urllib.error.URLError: If the request itself fails or times out.
    """
    # Canonical source: https://www.publicsuffix.org/list/public_suffix_list.dat
    url = (
        "https://raw.githubusercontent.com/publicsuffix/list/master/"
        "public_suffix_list.dat"
    )
    # The timeout keeps a stalled connection from hanging the script forever.
    with urllib.request.urlopen(url, timeout=30) as resp:
        if resp.status != 200:
            raise RuntimeError("Unable to get the TLD list")
        raw_tlds = resp.read().decode("utf8")

    # Test the *stripped* line so that indented comments are dropped as well.
    entries = (line.strip() for line in raw_tlds.splitlines())
    return {
        entry for entry in entries if entry and not entry.startswith("//")
    }


def strip_subdomains(uri: str) -> str:
    """Strip subdomains from the URI to broaden the match.

    If LastPass has `shop.foo.com` and bitwarden has `foo.com`, that should
    probably match.

    Best effort basis. Host names are case-insensitive and the suffix list
    is lowercase, so the input is lowercased before comparison; a trailing
    root dot (``foo.com.``) is dropped for the same reason.

    Args:
        uri: A bare host name (no scheme or path).

    Returns:
        The shortest right-hand portion of the host that is not itself an
        entry of the suffix list, or the whole normalized host if every
        portion is such an entry.

    >>> strip_subdomains("foo.bar.www.n8henrie.com")
    'n8henrie.com'
    >>> strip_subdomains("some.where.uk")
    'where.uk'
    >>> strip_subdomains("some.where.co.uk")
    'where.co.uk'
    """
    host = uri.strip().rstrip(".").lower()
    labels = host.split(".")
    tlds = get_tlds()

    # Grow the candidate one label at a time from the right-hand side; the
    # first candidate that is not a public suffix is the registrable domain.
    for start in range(len(labels) - 1, -1, -1):
        candidate = ".".join(labels[start:])
        if candidate not in tlds:
            return candidate

    # Every suffix (including the full host) is itself a public suffix.
    return host


# bw list items |
#   jq '.[10].login.uris[0].uri, .[10].login.username, .[10].login.password'
def get_bw_items(
    strip_subs: bool = False,
) -> dict[str, list[tuple[str, str]]]:
    """Return mapping of bitwarden logins.

    Format: { domain: [(username, password)] }

    A list of tuples is used per domain (rather than a nested dict) so that
    several logins for the same domain are all kept instead of overwriting
    one another.

    Args:
        strip_subs: If true, reduce every host with ``strip_subdomains``
            before using it as a key.

    Returns:
        A plain dict keyed by the network location of each login URI.
        Missing usernames and passwords are stored as ``""``.

    Raises:
        subprocess.CalledProcessError: If ``bw list items`` exits with a
            non-zero status (its stderr is echoed to stderr first).
    """
    cmd = subprocess.run(
        ["bw", "list", "items", "--nointeraction"], capture_output=True
    )
    # `!= 0` also catches negative codes (process killed by a signal).
    if cmd.returncode != 0:
        # Diagnostics go to stderr so they never mix with the report output.
        print(cmd.stderr.decode(errors="replace"), file=sys.stderr)
        cmd.check_returncode()

    bw_items = defaultdict(list)

    for item in json.loads(cmd.stdout):
        login = item.get("login")
        if not login:
            # Not a login item (or an empty one): nothing to compare.
            continue

        # Replace `None`s with ""
        user = login.get("username") or ""
        pw = login.get("password") or ""

        for uri_entry in login.get("uris") or []:
            # A URI entry may lack the key or hold null; treat both as "".
            uri = urlparse(uri_entry.get("uri") or "").netloc
            if strip_subs:
                uri = strip_subdomains(uri)
            bw_items[uri].append((user, pw))

    return dict(bw_items)


def get_lp_items(
    filehandle, strip_subs: bool = False
) -> list[tuple[str, str, str]]:
    """Return list of lastpass logins.

    Format: [(uri, username, password)]

    Args:
        filehandle: Open text file containing a CSV export with at least
            the columns ``url``, ``username`` and ``password``. It is always
            closed before returning, even if reading fails.
        strip_subs: If true, reduce every host with ``strip_subdomains``.

    Returns:
        One tuple per CSV row; ``uri`` is the network location of the row's
        URL. Fields missing from a short row are returned as ``""``.

    Raises:
        KeyError: If one of the required columns is absent from the header.
    """
    lp_items = []
    # The context manager guarantees the handle is closed on errors too.
    with filehandle:
        for row in csv.DictReader(filehandle):
            # DictReader fills fields missing from a short row with None.
            uri = urlparse(row["url"] or "").netloc
            if strip_subs:
                uri = strip_subdomains(uri)
            lp_items.append(
                (uri, row["username"] or "", row["password"] or "")
            )
    return lp_items


def _compare(
    lp_items,
    bw_items,
    passwords_anywhere: bool = False,
) -> dict[str, dict[str, dict[str, str]]]:

    matches = dict()

    if passwords_anywhere:
        lp_pws: set[str] = {pw for pw in lp_items}

        for _, _, lp_pw in lp_pws:
            for uri, login in bw_items.items():
                for bw_user, bw_pw in login:
                    if bw_pw == lp_pw and lp_pw.strip():
                        matches[uri] = {
                            "password": lp_pw,
                            "lp_user": None,
                            "bw_user": None,
                        }
        return matches

    for (uri, lp_user, lp_pw) in lp_items:
        if not (bw_logins := bw_items.get(uri)):
            continue
        for (bw_user, bw_pw) in bw_logins:
            if lp_pw == bw_pw:
                matches[uri] = {
                    "password": lp_pw,
                    "lp_user": lp_user,
                    "bw_user": bw_user,
                }

    return matches


def _output(
    matches,
    fmt: str,
) -> str:
    """Render the matches as text.

    Args:
        matches: Mapping ``{uri: {"password", "lp_user", "bw_user"}}``.
        fmt: Either ``"json"`` or ``"csv"``.

    Returns:
        For ``"json"``, an indented document with sorted keys. For
        ``"csv"``, a header row followed by one row per match, without a
        trailing newline.

    Raises:
        ValueError: If ``fmt`` is neither ``"json"`` nor ``"csv"``.
    """
    if fmt == "json":
        return json.dumps(matches, sort_keys=True, indent=4)

    if fmt == "csv":
        output = io.StringIO()
        # Always "\n": text-mode stdout translates it to the platform line
        # ending, so os.linesep would become "\r\r\n" on Windows.
        writer = csv.writer(output, lineterminator="\n")
        writer.writerow(["uri", "lp_user", "bw_user", "password"])
        writer.writerows(
            [uri, entry["lp_user"], entry["bw_user"], entry["password"]]
            for uri, entry in matches.items()
        )
        # Drop only the final line terminator; a bare strip() would also eat
        # trailing whitespace that belongs to the last password.
        return output.getvalue().removesuffix("\n")

    raise ValueError(f"Unknown output format: {fmt}")


def main():
    """Run the whole comparison and print the report.

    Parses the command line, refuses to continue (usage + exit status 1)
    unless ``--yes-i-read-the-script`` was given, loads both vaults, compares
    them and prints the result in the requested format.
    """
    args = _cli()
    if not args.yes_i_read_the_script:
        _cli(show_usage=True)
        sys.exit(1)

    strip_subs = args.strip_subdomains
    lp_items = get_lp_items(args.lastpass_csv, strip_subs=strip_subs)
    bw_items = get_bw_items(strip_subs=strip_subs)

    matches = _compare(
        lp_items=lp_items,
        bw_items=bw_items,
        passwords_anywhere=args.passwords_anywhere,
    )

    # The shortcut flags win over --output-format (argparse already makes
    # the three options mutually exclusive).
    if args.csv:
        fmt = "csv"
    elif args.json:
        fmt = "json"
    else:
        fmt = args.output_format

    print(_output(matches, fmt=fmt))


# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	search_bad_logins.py :: Compares a LastPass export to your Bitwarden vault.

	Python >=3.10, no third party Python libraries / dependencies.

	Outputs BW logins that may have been compromised in the recent LastPass hack
	based on matching domain and password.

	It would probably make sense to cast an even wider net by using something like
	[`xsv`](https://github.com/BurntSushi/xsv) to just search for a potentially
	compromised password anywhere in your vault, but this should help point out
	some "this definitely needs to be changed" logins.

	Prior to running, you'll need to:
	1. Install the Bitwarden CLI
	2. Log in with `bw login`
	3. Export the session variable:
	- Copy and paste the line starting with `$ export BW_SESSION=`
	- You'll need to remove the leading `$`
	- I highly recommend you insert a leading space where the `$` was,
	which should keep this command out of your bash history
	4. Run this script from the same terminal you used for steps 1-3; run
	`bw list items` to make sure your login is working first

	Usage:
	./search_bad_logins.py --yes-i-read-the-script /path/to/lastpass-export.csv
	"""

	import argparse
	import csv
	import io
	import json
	import os
	import subprocess
	import sys
	import urllib.request
	from collections import defaultdict
	from functools import lru_cache
	from urllib.parse import urlparse


	def _cli(show_usage: bool = False) -> argparse.Namespace:
	    """Build the argument parser and parse the command line.

	    Args:
	        show_usage: If true, do not parse ``sys.argv``; print the full help
	            text and exit with status 1. ``main`` takes this path when it
	            refuses to run, so the exit status must signal failure
	            (``parse_args(["--help"])`` would exit with status 0).

	    Returns:
	        The parsed arguments. ``lastpass_csv`` has already been opened for
	        reading by ``argparse.FileType``.

	    Raises:
	        SystemExit: On ``--help``, on invalid arguments, or with status 1
	            when ``show_usage`` is true.
	    """
	    parser = argparse.ArgumentParser()
	    # Second FileType argument is the buffer size; 1 selects line buffering.
	    parser.add_argument("lastpass_csv", type=argparse.FileType("r", 1))
	    parser.add_argument(
	        "--yes-i-read-the-script",
	        action="store_true",
	        help="""
	            This tool is reading both your lastpass and bitwarden vault data.
	            You really should read the source code before you run it. Will
	            not run without this flag set.
	            """,
	    )
	    parser.add_argument(
	        "--strip-subdomains",
	        action="store_true",
	        help="""
	            Try to match based on TLD alone. This option will require
	            downloading the list of TLDs from its GitHub mirror. Doesn't cache
	            the response (yet).
	            """,
	    )
	    parser.add_argument(
	        "--passwords-anywhere",
	        action="store_true",
	        help="""
	            Report any Bitwarden item that has a matching password anywhere in
	            Lastpass. Skips matching on URI or username, so doesn't output
	            usernames, and the shown URI is for the relevant Bitwarden item.
	            """,
	    )

	    # Only one way of choosing the output format may be given at a time.
	    group = parser.add_mutually_exclusive_group()
	    group.add_argument(
	        "--output-format",
	        default="csv",
	        choices=["csv", "json"],
	        help="Output format",
	    )
	    group.add_argument(
	        "--json",
	        action="store_true",
	        help="Shortcut for `--output-format=json`",
	    )
	    group.add_argument(
	        "--csv",
	        action="store_true",
	        help="Shortcut for `--output-format=csv`",
	    )

	    if show_usage:
	        parser.print_help()
	        parser.exit(status=1)

	    return parser.parse_args()


	@lru_cache
	def get_tlds() -> set[str]:
	    """Return the set of entries in the Public Suffix List.

	    Uses the GitHub mirror to save publicsuffix.org some bandwidth. The
	    result is memoized for the lifetime of the process (``lru_cache``) but is
	    not stored on disk, so the list is downloaded once per run; please be
	    kind.

	    Returns:
	        Every non-blank, non-comment line of the list, stripped of
	        surrounding whitespace. Wildcard (``*.``) and exception (``!``)
	        rules are returned verbatim.

	    Raises:
	        RuntimeError: If the server answers with a status other than 200.
	        urllib.error.URLError: If the request itself fails or times out.
	    """
	    # Canonical source: https://www.publicsuffix.org/list/public_suffix_list.dat
	    url = (
	        "https://raw.githubusercontent.com/publicsuffix/list/master/"
	        "public_suffix_list.dat"
	    )
	    # The timeout keeps a stalled connection from hanging the script forever.
	    with urllib.request.urlopen(url, timeout=30) as resp:
	        if resp.status != 200:
	            raise RuntimeError("Unable to get the TLD list")
	        raw_tlds = resp.read().decode("utf8")

	    # Test the *stripped* line so that indented comments are dropped as well.
	    entries = (line.strip() for line in raw_tlds.splitlines())
	    return {
	        entry for entry in entries if entry and not entry.startswith("//")
	    }


	def strip_subdomains(uri: str) -> str:
	    """Strip subdomains from the URI to broaden the match.

	    If LastPass has `shop.foo.com` and bitwarden has `foo.com`, that should
	    probably match.

	    Best effort basis. Host names are case-insensitive and the suffix list
	    is lowercase, so the input is lowercased before comparison; a trailing
	    root dot (``foo.com.``) is dropped for the same reason.

	    Args:
	        uri: A bare host name (no scheme or path).

	    Returns:
	        The shortest right-hand portion of the host that is not itself an
	        entry of the suffix list, or the whole normalized host if every
	        portion is such an entry.

	    >>> strip_subdomains("foo.bar.www.n8henrie.com")
	    'n8henrie.com'
	    >>> strip_subdomains("some.where.uk")
	    'where.uk'
	    >>> strip_subdomains("some.where.co.uk")
	    'where.co.uk'
	    """
	    host = uri.strip().rstrip(".").lower()
	    labels = host.split(".")
	    tlds = get_tlds()

	    # Grow the candidate one label at a time from the right-hand side; the
	    # first candidate that is not a public suffix is the registrable domain.
	    for start in range(len(labels) - 1, -1, -1):
	        candidate = ".".join(labels[start:])
	        if candidate not in tlds:
	            return candidate

	    # Every suffix (including the full host) is itself a public suffix.
	    return host


	# bw list items |
	# jq '.[10].login.uris[0].uri, .[10].login.username, .[10].login.password'
	def get_bw_items(
	    strip_subs: bool = False,
	) -> dict[str, list[tuple[str, str]]]:
	    """Return mapping of bitwarden logins.

	    Format: { domain: [(username, password)] }

	    A list of tuples is used per domain (rather than a nested dict) so that
	    several logins for the same domain are all kept instead of overwriting
	    one another.

	    Args:
	        strip_subs: If true, reduce every host with ``strip_subdomains``
	            before using it as a key.

	    Returns:
	        A plain dict keyed by the network location of each login URI.
	        Missing usernames and passwords are stored as ``""``.

	    Raises:
	        subprocess.CalledProcessError: If ``bw list items`` exits with a
	            non-zero status (its stderr is echoed to stderr first).
	    """
	    cmd = subprocess.run(
	        ["bw", "list", "items", "--nointeraction"], capture_output=True
	    )
	    # `!= 0` also catches negative codes (process killed by a signal).
	    if cmd.returncode != 0:
	        # Diagnostics go to stderr so they never mix with the report output.
	        print(cmd.stderr.decode(errors="replace"), file=sys.stderr)
	        cmd.check_returncode()

	    bw_items = defaultdict(list)

	    for item in json.loads(cmd.stdout):
	        login = item.get("login")
	        if not login:
	            # Not a login item (or an empty one): nothing to compare.
	            continue

	        # Replace `None`s with ""
	        user = login.get("username") or ""
	        pw = login.get("password") or ""

	        for uri_entry in login.get("uris") or []:
	            # A URI entry may lack the key or hold null; treat both as "".
	            uri = urlparse(uri_entry.get("uri") or "").netloc
	            if strip_subs:
	                uri = strip_subdomains(uri)
	            bw_items[uri].append((user, pw))

	    return dict(bw_items)


	def get_lp_items(
	    filehandle, strip_subs: bool = False
	) -> list[tuple[str, str, str]]:
	    """Return list of lastpass logins.

	    Format: [(uri, username, password)]

	    Args:
	        filehandle: Open text file containing a CSV export with at least
	            the columns ``url``, ``username`` and ``password``. It is always
	            closed before returning, even if reading fails.
	        strip_subs: If true, reduce every host with ``strip_subdomains``.

	    Returns:
	        One tuple per CSV row; ``uri`` is the network location of the row's
	        URL. Fields missing from a short row are returned as ``""``.

	    Raises:
	        KeyError: If one of the required columns is absent from the header.
	    """
	    lp_items = []
	    # The context manager guarantees the handle is closed on errors too.
	    with filehandle:
	        for row in csv.DictReader(filehandle):
	            # DictReader fills fields missing from a short row with None.
	            uri = urlparse(row["url"] or "").netloc
	            if strip_subs:
	                uri = strip_subdomains(uri)
	            lp_items.append(
	                (uri, row["username"] or "", row["password"] or "")
	            )
	    return lp_items


	def _compare(
	lp_items,
	bw_items,
	passwords_anywhere: bool = False,
	) -> dict[str, dict[str, dict[str, str]]]:

	matches = dict()

	if passwords_anywhere:
	lp_pws: set[str] = {pw for pw in lp_items}

	for _, _, lp_pw in lp_pws:
	for uri, login in bw_items.items():
	for bw_user, bw_pw in login:
	if bw_pw == lp_pw and lp_pw.strip():
	matches[uri] = {
	"password": lp_pw,
	"lp_user": None,
	"bw_user": None,
	}
	return matches

	for (uri, lp_user, lp_pw) in lp_items:
	if not (bw_logins := bw_items.get(uri)):
	continue
	for (bw_user, bw_pw) in bw_logins:
	if lp_pw == bw_pw:
	matches[uri] = {
	"password": lp_pw,
	"lp_user": lp_user,
	"bw_user": bw_user,
	}

	return matches


	def _output(
	    matches,
	    fmt: str,
	) -> str:
	    """Render the matches as text.

	    Args:
	        matches: Mapping ``{uri: {"password", "lp_user", "bw_user"}}``.
	        fmt: Either ``"json"`` or ``"csv"``.

	    Returns:
	        For ``"json"``, an indented document with sorted keys. For
	        ``"csv"``, a header row followed by one row per match, without a
	        trailing newline.

	    Raises:
	        ValueError: If ``fmt`` is neither ``"json"`` nor ``"csv"``.
	    """
	    if fmt == "json":
	        return json.dumps(matches, sort_keys=True, indent=4)

	    if fmt == "csv":
	        output = io.StringIO()
	        # Always "\n": text-mode stdout translates it to the platform line
	        # ending, so os.linesep would become "\r\r\n" on Windows.
	        writer = csv.writer(output, lineterminator="\n")
	        writer.writerow(["uri", "lp_user", "bw_user", "password"])
	        writer.writerows(
	            [uri, entry["lp_user"], entry["bw_user"], entry["password"]]
	            for uri, entry in matches.items()
	        )
	        # Drop only the final line terminator; a bare strip() would also eat
	        # trailing whitespace that belongs to the last password.
	        return output.getvalue().removesuffix("\n")

	    raise ValueError(f"Unknown output format: {fmt}")


	def main():
	    """Run the whole comparison and print the report.

	    Parses the command line, refuses to continue (usage + exit status 1)
	    unless ``--yes-i-read-the-script`` was given, loads both vaults, compares
	    them and prints the result in the requested format.
	    """
	    args = _cli()
	    if not args.yes_i_read_the_script:
	        _cli(show_usage=True)
	        sys.exit(1)

	    strip_subs = args.strip_subdomains
	    lp_items = get_lp_items(args.lastpass_csv, strip_subs=strip_subs)
	    bw_items = get_bw_items(strip_subs=strip_subs)

	    matches = _compare(
	        lp_items=lp_items,
	        bw_items=bw_items,
	        passwords_anywhere=args.passwords_anywhere,
	    )

	    # The shortcut flags win over --output-format (argparse already makes
	    # the three options mutually exclusive).
	    if args.csv:
	        fmt = "csv"
	    elif args.json:
	        fmt = "json"
	    else:
	        fmt = args.output_format

	    print(_output(matches, fmt=fmt))


	# Run the comparison only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()