Last active
June 19, 2024 19:50
-
-
Save serif/a1281c676cf5a1f77af6ff1a25255a85 to your computer and use it in GitHub Desktop.
Bitwarden Duplicate Entry Remover v2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Bitwarden duplicate-entry remover.
# created 2018; updated 2020, 2021, 2023-10-12, 2023-11-27
import hashlib
import sys
from urllib.parse import urlparse
def main(argv):
    """Deduplicate a Bitwarden CSV export.

    argv[0] is the path to the exported .csv file. Unique entries are
    written to <name>_out.csv; removed duplicates and unrecoverable rows
    go to <name>_rem.csv. Two entries are duplicates when the MD5 of
    (login_uri domain + username + password) matches.
    """
    # Expected header fields of a Bitwarden individual-vault CSV export.
    f = 'folder,favorite,type,name,notes,fields,reprompt,login_uri,login_username,login_password,login_totp'.split(',')
    if len(argv) < 1:
        sys.exit('Supply input file path as command argument')
    in_path = argv[0]
    csv = '.csv'
    # Derive the output names from the trailing extension only;
    # str.replace('.csv', ...) would hit every '.csv' inside the path.
    base = in_path[:-len(csv)] if in_path.endswith(csv) else in_path
    out_path = base + '_out' + csv
    rem_path = base + '_rem' + csv
    completed_lines_hash = set()
    line_number = -1
    write_count = 0
    cache = ''
    # Process file
    with open(out_path, 'w', encoding='utf8') as out_file, \
            open(rem_path, 'w', encoding='utf8') as rem_file, \
            open(in_path, 'r', encoding='utf8') as in_file:
        for line in in_file:
            line_number += 1
            if line_number == 0:
                # Validate the .csv header before processing anything.
                if line.strip() != ','.join(f):
                    print('\nBitwarden CSV format has changed.')
                    print('Contact author for update.')
                    sys.exit(1)  # sys.exit, not bare exit() (REPL-only helper)
                # Keep the header so the deduped file stays importable.
                out_file.write(line)
                continue
            # Skip empty lines
            if not line.strip():
                continue
            fields = line.split(',')
            if len(fields) < len(f):
                # Short row: assume an embedded newline split the previous
                # entry and try to merge with the cached partial line.
                line = cache.strip('\n') + line
                cache = line
                fields = line.split(',')
                if len(fields) == len(f):
                    print(f'Recovered with line {line_number}:\n{line}')
                    cache = ''
                else:
                    print(f'Missing fields in line {line_number}:\n{line}')
                    rem_file.write(line)
                    continue
            else:
                cache = ''
            # Generate an MD5 hash based on login URI, username, and password.
            # Reduce the URI to its domain so the same site with different
            # paths still counts as one entry.
            domain = urlparse(fields[f.index('login_uri')]).netloc
            if domain:
                fields[f.index('login_uri')] = domain
            token = (fields[f.index('login_uri')]
                     + fields[f.index('login_username')]
                     + fields[f.index('login_password')])
            hash_value = hashlib.md5(token.rstrip().encode('utf-8')).hexdigest()
            # First occurrence wins; later matches go to the _rem file.
            if hash_value not in completed_lines_hash:
                out_file.write(line)
                completed_lines_hash.add(hash_value)
                write_count += 1
            else:
                rem_file.write(line)
    # Report (line_number excludes the header, so this is data lines - unique).
    dup_count = line_number - write_count
    print(f'\nOutput file: {out_path}\n{write_count} unique entries saved')
    print(f'\n{dup_count} duplicates saved to {rem_path}')
# Entry point: forward only the user-supplied arguments (drop argv[0]).
if __name__ == "__main__":
    main(sys.argv[1:])
I have managed to import a file with multi-line entries using some modifications (full code below). It also works for me when there are curly braces inside password fields. The only remaining issue is that multiple deleted "lines" are written to the _rem file — one per iteration — while the script works through the multi-line portion.
#!/usr/bin/env python3
# updated 2024-02-07
# updated 2023-11-27
# updated 2023-10-12
# updated 2021
# updated 2020
# created 2018
import hashlib
import sys
from urllib.parse import urlparse
# With a little help from...
# https://stackoverflow.com/questions/29375614/how-to-get-csv-reader-to-ignore-commas-within-braces-curly-square-angle
# Opening/closing characters of a "protected" region: commas between a
# matching pair are NOT treated as field separators by split() below.
# Kept as sets so additional bracket styles can be added in one place.
l_braces = {"{"}
r_braces = {"}"}
def split(s):
    """Split *s* on top-level commas only.

    Commas inside double quotes or inside curly braces are not treated as
    separators, so quoted CSV fields and brace-containing passwords survive
    intact. Returns the list of pieces, each stripped of surrounding
    whitespace.

    Fixes a defect in the earlier version: when no top-level comma existed,
    it fell back to a plain ``s.split(",")``, which re-split commas inside
    braces/quotes and defeated the protection entirely.

    NOTE(review): doubled quotes ("") used for CSV quote-escaping toggle the
    in-quotes state twice, so a comma immediately after an escaped quote may
    be mis-classified — confirm against real exports if that case matters.
    """
    depth = 0        # > 0 while inside a quoted or braced region
    quotes_seen = 0  # running '"' count; parity tracks inside/outside quotes
    breaks = []      # indices of commas that act as separators
    for i, c in enumerate(s):
        if c == '"':
            quotes_seen += 1
            # Odd count: just entered a quoted region; even: just left it.
            if quotes_seen % 2 == 1:
                depth += 1
            else:
                depth -= 1
        if c == "{":
            depth += 1
        if c == "}":
            depth -= 1
        if c == "," and depth == 0:
            breaks.append(i)
    # Cut at each separator; the final slice is the tail after the last
    # break, or the whole string when there were no top-level commas.
    pieces = []
    start = 0
    for b in breaks:
        pieces.append(s[start:b].strip())
        start = b + 1
    pieces.append(s[start:].strip())
    return pieces
def main(argv):
    """Deduplicate a Bitwarden CSV export, handling multi-line entries.

    argv[0] is the path to the exported .csv file. Unique entries are
    written to <name>_out.csv; duplicates and unrecoverable rows go to
    <name>_rem.csv. Duplicate detection hashes (login_uri domain +
    username + password); entries with all three empty (secure notes)
    fall back to hashing the notes field.
    """
    # Expected header fields of a Bitwarden individual-vault CSV export.
    f = "folder,favorite,type,name,notes,fields,reprompt,login_uri,login_username,login_password,login_totp".split(",")
    if len(argv) < 1:
        sys.exit("Supply input file path as command argument")
    in_path = argv[0]
    csv = ".csv"
    # Derive output names from the trailing extension only;
    # str.replace('.csv', ...) would hit every '.csv' inside the path.
    base = in_path[: -len(csv)] if in_path.endswith(csv) else in_path
    out_path = base + "_out" + csv
    rem_path = base + "_rem" + csv
    completed_lines_hash = set()
    line_number = -1
    write_count = 0
    cache = ""
    # Process file
    with open(out_path, "w", encoding="utf8") as out_file, open(
        rem_path, "w", encoding="utf8"
    ) as rem_file, open(in_path, "r", encoding="utf8") as in_file:
        for line in in_file:
            line_number += 1
            if line_number == 0:
                # Validate the .csv header before processing anything.
                if line.strip() != ",".join(f):
                    print("\nBitwarden CSV format has changed.")
                    print("Contact author for update.")
                    sys.exit(1)  # sys.exit, not bare exit() (REPL-only)
                # Keep the header so the deduped file stays importable.
                out_file.write(line)
                continue
            # Skip blank lines, but only when NOT inside a multi-line
            # entry — blank lines can be part of a multi-line notes field.
            if not line.strip() and not cache:
                continue
            fields = split(line)
            if len(fields) < len(f):
                # Short row: part of a multi-line entry. Accumulate and
                # retry. Do NOT write partial accumulations to the _rem
                # file — the previous version emitted one _rem line per
                # iteration while walking through a multi-line entry.
                line = cache + line
                cache = line
                fields = split(line)
                if len(fields) == len(f):
                    print(f"Recovered with line {line_number}:\n{line}")
                    cache = ""
                else:
                    continue  # keep accumulating
            else:
                cache = ""
            # Generate an MD5 hash based on login URI, username, password.
            # Reduce the URI to its domain so the same site with different
            # paths still counts as one entry.
            domain = urlparse(fields[f.index("login_uri")]).netloc
            if domain:
                fields[f.index("login_uri")] = domain
            token = fields[f.index("login_uri")]
            token += fields[f.index("login_username")]
            token += fields[f.index("login_password")]
            if token == "":
                # Secure notes carry no login fields; hash the note body.
                token = fields[f.index("notes")]
            hash_value = hashlib.md5(token.rstrip().encode("utf-8")).hexdigest()
            # First occurrence wins; later matches go to the _rem file.
            if hash_value not in completed_lines_hash:
                out_file.write(line)
                completed_lines_hash.add(hash_value)
                write_count += 1
            else:
                rem_file.write(line)
        if cache:
            # EOF reached with an incomplete entry: record it exactly once.
            print(f"Unrecoverable entry at end of file:\n{cache}")
            rem_file.write(cache)
    # Report. NOTE(review): line_number counts each physical line of a
    # multi-line entry, so dup_count is approximate for such files.
    dup_count = line_number - write_count
    print(f"\nOutput file: {out_path}\n{write_count} unique entries saved")
    print(f"\n{dup_count} duplicates saved to {rem_path}")
if __name__ == "__main__":
main(sys.argv[1:])
To anyone new looking for a script that, in addition to removing duplicates, also helps you find and get rid of old copies of passwords to websites (where you have the same username but both old and new passwords): this script that I wrote can help.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you for providing this code. Unfortunately, it fails with multiline entries created by latest bitwarden on Windows 10.