@serif
Last active June 19, 2024 19:50
Bitwarden Duplicate Entry Remover v2
#!/usr/bin/env python3
# updated 2023-11-27
# updated 2023-10-12
# updated 2021
# updated 2020
# created 2018
import sys
import hashlib
from urllib.parse import urlparse


def main(argv):
    # Fields in Bitwarden CSV
    f = 'folder,favorite,type,name,notes,fields,reprompt,login_uri,login_username,login_password,login_totp'.split(',')
    if len(argv) < 1:
        sys.exit('Supply input file path as command argument')
    in_path = argv[0]
    csv = '.csv'
    csv_out = '_out' + csv
    csv_rem = '_rem' + csv
    out_path = in_path.replace(csv, csv_out)
    rem_path = in_path.replace(csv, csv_rem)
    completed_lines_hash = set()
    line_number = -1
    write_count = 0
    cache = ''
    # Process file
    with open(out_path, 'w', encoding='utf8') as out_file, \
         open(rem_path, 'w', encoding='utf8') as rem_file, \
         open(in_path, 'r', encoding='utf8') as in_file:
        for line in in_file:
            line_number += 1
            # Validate .csv format
            if line_number == 0 and not line.strip() == ','.join(f):
                print('\nBitwarden CSV format has changed.')
                print('Contact author for update.')
                exit(1)
            # Skip empty lines
            if not line.strip():
                continue
            fields = line.split(',')
            # If the line has fewer fields than expected,
            # try to combine with the previous line
            if len(fields) < len(f):
                # Add previous line if short
                line = cache.strip('\n') + line
                cache = line
                fields = line.split(',')
                if len(fields) == len(f):
                    print(f'Recovered with line {line_number}:\n{line}')
                    cache = ''
                else:
                    print(f'Missing fields in line {line_number}:\n{line}')
                    rem_file.write(line)
                    continue
            else:
                cache = ''
            # Generate an MD5 hash based on login URI, username, and password
            if line_number != 0:
                domain = urlparse(fields[f.index('login_uri')]).netloc
                if len(domain) > 0:
                    fields[f.index('login_uri')] = domain
            token = fields[f.index('login_uri')]
            token += fields[f.index('login_username')]
            token += fields[f.index('login_password')]
            hashValue = hashlib.md5(token.rstrip().encode('utf-8')).hexdigest()
            # Write entry
            if hashValue not in completed_lines_hash:
                out_file.write(line)
                completed_lines_hash.add(hashValue)
                write_count += 1
            else:
                rem_file.write(line)
                # print(f'Duplicate on line {line_number}:\n{line}')
    # Report
    dup_count = line_number - write_count
    print(f'\nOutput file: {out_path}\n{write_count} unique entries saved')
    print(f'\n{dup_count} duplicates saved to {rem_path}')


if __name__ == "__main__":
    main(sys.argv[1:])
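
Usage is a single command. A minimal sketch, assuming the script is saved as dedup.py and the export as bitwarden_export.csv (both names hypothetical); the output paths follow from the suffix logic above:

python3 dedup.py bitwarden_export.csv
# writes bitwarden_export_out.csv (unique entries)
# writes bitwarden_export_rem.csv (removed lines)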
lpr2rpl commented Jan 30, 2024

Thank you for providing this code. Unfortunately, it fails with multi-line entries created by the latest Bitwarden on Windows 10.
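
For context, a record with a multi-line note looks roughly like this in the export (hypothetical values, fields per the CSV header above), so a line-by-line reader sees two short fragments instead of one eleven-field record:

,0,login,Example,"first line of note
second line",,0,https://example.com,user,secret,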

@oscarsiles

I have managed to import a file with multi-line entries using some modifications (full code below). It also works for me when there are curly braces inside password fields. The only issue is that multiple deleted "lines" are created in the _rem file, one for each iteration through the multi-line portion.

#!/usr/bin/env python3
# updated 2024-02-07
# updated 2023-11-27
# updated 2023-10-12
# updated 2021
# updated 2020
# created 2018
import hashlib
import sys
from urllib.parse import urlparse

# With a little help from...
# https://stackoverflow.com/questions/29375614/how-to-get-csv-reader-to-ignore-commas-within-braces-curly-square-angle
l_braces = {"{"}
r_braces = {"}"}


def split(s):
    brace_count = 0
    quote_count = 0
    breaks = []

    for i, c in enumerate(s):
        if c == '"':
            quote_count += 1
            if quote_count % 2 == 1:
                brace_count += 1
            else:
                brace_count -= 1

        if c in l_braces:
            brace_count += 1

        if c in r_braces:
            brace_count -= 1

        if (c in [","]) and (brace_count == 0):
            breaks.append(i)

    pieces = []

    lag = 0
    for b in breaks:
        pieces.append(s[lag:b].strip())
        lag = b + 1

    try:
        pieces.append(s[breaks[-1] + 1 :].strip())
    except IndexError:
        pieces = s.split(",")

    return pieces
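
# A quick illustration of split() on hypothetical inputs: commas inside
# double quotes or braces do not separate fields, top-level commas do.
#   split('a,"b,c",d')  ->  ['a', '"b,c"', 'd']
#   split('x,{y,z},w')  ->  ['x', '{y,z}', 'w']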


def main(argv):
    # Fields in Bitwarden CSV
    f = "folder,favorite,type,name,notes,fields,reprompt,login_uri,login_username,login_password,login_totp".split(",")

    if len(argv) < 1:
        sys.exit("Supply input file path as command argument")

    in_path = argv[0]
    csv = ".csv"
    csv_out = "_out" + csv
    csv_rem = "_rem" + csv
    out_path = in_path.replace(csv, csv_out)
    rem_path = in_path.replace(csv, csv_rem)
    completed_lines_hash = set()
    line_number = -1
    write_count = 0
    cache = ""

    # Process file
    with open(out_path, "w", encoding="utf8") as out_file, open(
        rem_path, "w", encoding="utf8"
    ) as rem_file, open(in_path, "r", encoding="utf8") as in_file:
        for line in in_file:
            line_number += 1

            # Validate .csv format
            if line_number == 0 and not line.strip() == ",".join(f):
                print("\nBitwarden CSV format has changed.")
                print("Contact author for update.")
                exit(1)

            # Skip empty lines
            if not line.strip():
                continue
            fields = split(line)

            # If the line has fewer fields than expected,
            # try to combine with the previous line
            if len(fields) < len(f):
                # Add previous line if short
                line = cache + line
                cache = line
                fields = split(line)
                if len(fields) == len(f):
                    print(f"Recovered with line {line_number}:\n{line}")
                    cache = ""
                else:
                    print(f"Missing fields in line {line_number}:\n{line}")
                    rem_file.write(line)
                    continue
            else:
                cache = ""

            # Generate an MD5 hash based on login URI, username, and password
            if line_number != 0:
                domain = urlparse(fields[f.index("login_uri")]).netloc
                if len(domain) > 0:
                    fields[f.index("login_uri")] = domain
            token = fields[f.index("login_uri")]
            token += fields[f.index("login_username")]
            token += fields[f.index("login_password")]
            if token == "":
                token = fields[f.index("notes")]
            hashValue = hashlib.md5(token.rstrip().encode("utf-8")).hexdigest()

            # Write entry
            if hashValue not in completed_lines_hash:
                out_file.write(line)
                completed_lines_hash.add(hashValue)
                write_count += 1
            else:
                rem_file.write(line)
                # print(f'Duplicate on line {line_number}:\n{line}')

    # Report
    dup_count = line_number - write_count
    print(f"\nOutput file: {out_path}\n{write_count} unique entries saved")
    print(f"\n{dup_count} duplicates saved to {rem_path}")


if __name__ == "__main__":
    main(sys.argv[1:])

howird commented Jun 19, 2024

To anyone new looking for a script that, in addition to removing duplicates, also helps you find and get rid of old copies of passwords for websites (where you have the same username but both old and new passwords): this script that I wrote can help.
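
The linked script is not reproduced here, but the core idea (flag entries that share a site and username yet hold different passwords) can be sketched in a few lines. A hypothetical sketch, not howird's code; note that csv.DictReader also handles quoted multi-line fields natively:

import csv
from collections import defaultdict

def find_stale_credentials(path):
    # Group passwords by (login_uri, login_username) and report any
    # pair that has accumulated more than one distinct password.
    groups = defaultdict(set)
    with open(path, newline="", encoding="utf8") as fh:
        for row in csv.DictReader(fh):
            if row["login_password"]:
                groups[(row["login_uri"], row["login_username"])].add(row["login_password"])
    for (uri, user), passwords in groups.items():
        if len(passwords) > 1:
            print(f"{uri} / {user}: {len(passwords)} different passwords")

find_stale_credentials("bitwarden_export.csv")  # hypothetical filename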
