Skip to content

Instantly share code, notes, and snippets.

@serif
Last active June 24, 2024 06:01
Show Gist options
  • Save serif/a1281c676cf5a1f77af6ff1a25255a85 to your computer and use it in GitHub Desktop.
Bitwarden Duplicate Entry Remover v2
#!/usr/bin/env python3
# updated 2023-11-27
# updated 2023-10-12
# updated 2021
# updated 2020
# created 2018
import sys
import hashlib
from urllib.parse import urlparse
def main(argv):
    """Remove duplicate entries from a Bitwarden CSV export.

    Reads the CSV at ``argv[0]``, writes unique entries (plus the header)
    to ``<name>_out.csv`` and removed lines — duplicates and unrecoverable
    short lines — to ``<name>_rem.csv``.  Two entries are duplicates when
    the (login_uri domain, login_username, login_password) triple produces
    the same MD5 digest.

    argv: command-line arguments, argv[0] is the input file path.
    Exits via sys.exit() on missing argument or unexpected CSV header.
    """
    # Expected header fields of a Bitwarden CSV export.
    f = 'folder,favorite,type,name,notes,fields,reprompt,login_uri,login_username,login_password,login_totp'.split(',')
    if len(argv) < 1:
        sys.exit('Supply input file path as command argument')
    in_path = argv[0]
    # Derive output paths: strip only a trailing '.csv' so a '.csv'
    # appearing earlier in the path (e.g. a directory name) is untouched.
    stem = in_path[:-len('.csv')] if in_path.endswith('.csv') else in_path
    out_path = stem + '_out.csv'
    rem_path = stem + '_rem.csv'
    completed_lines_hash = set()   # MD5 digests of entries already written
    line_number = -1
    write_count = 0                # unique data entries written
    entry_count = 0                # data entries considered for dedup
    cache = ''                     # partial line awaiting its continuation
    # Process file
    with open(out_path, 'w', encoding='utf8') as out_file, \
            open(rem_path, 'w', encoding='utf8') as rem_file, \
            open(in_path, 'r', encoding='utf8') as in_file:
        for line in in_file:
            line_number += 1
            # Validate .csv format: first line must be the known header.
            if line_number == 0:
                if line.strip() != ','.join(f):
                    print('\nBitwarden CSV format has changed.')
                    print('Contact author for update.')
                    sys.exit(1)
                # Header passes straight through; it is not a data entry,
                # so it must not reach the hashing/dedup logic below.
                out_file.write(line)
                continue
            # Skip empty lines
            if not line.strip():
                continue
            # NOTE(review): naive comma split — a quoted field containing
            # commas is only handled by the short-line recovery below,
            # and a field with embedded newlines relies on the cache merge.
            fields = line.split(',')
            # If the line has fewer fields than expected, try to combine
            # it with the previously cached partial line.
            if len(fields) < len(f):
                line = cache.strip('\n') + line
                cache = line
                fields = line.split(',')
                if len(fields) == len(f):
                    print(f'Recovered with line {line_number}:\n{line}')
                    cache = ''
                else:
                    print(f'Missing fields in line {line_number}:\n{line}')
                    rem_file.write(line)
                    continue
            else:
                cache = ''
            entry_count += 1
            # Build the dedup key from login URI, username, and password.
            # Reduce the URI to its domain when one can be parsed, so
            # different paths on the same site compare equal.
            domain = urlparse(fields[f.index('login_uri')]).netloc
            if len(domain) > 0:
                fields[f.index('login_uri')] = domain
            token = fields[f.index('login_uri')]
            token += fields[f.index('login_username')]
            token += fields[f.index('login_password')]
            hash_value = hashlib.md5(token.rstrip().encode('utf-8')).hexdigest()
            # Write entry: first sighting goes to the output file,
            # repeats go to the removed file.
            if hash_value not in completed_lines_hash:
                out_file.write(line)
                completed_lines_hash.add(hash_value)
                write_count += 1
            else:
                rem_file.write(line)
                # print(f'Duplicate on line {line_number}:\n{line}')
    # Report: duplicates are counted against data entries only, not the
    # header or blank lines (the original line_number-based count was off).
    dup_count = entry_count - write_count
    print(f'\nOutput file: {out_path}\n{write_count} unique entries saved')
    print(f'\n{dup_count} duplicates saved to {rem_path}')
# Script entry point: forward CLI arguments (minus the program name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])
@topisani
Copy link

This uses the bitwarden json export, preserving more data, including properly preserving notes. It also keeps the newest entry if multiple exist. Consider merging this with @howird's logic above for the best combination
https://gist.github.com/topisani/066b63b87346afe76ffdf0998d4ebc2f

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment