
@serif
Last active June 24, 2024 06:01
Bitwarden Duplicate Entry Remover v2
#!/usr/bin/env python3
# updated 2023-11-27
# updated 2023-10-12
# updated 2021
# updated 2020
# created 2018
import sys
import hashlib
from urllib.parse import urlparse


def main(argv):
    # Fields in Bitwarden CSV
    f = 'folder,favorite,type,name,notes,fields,reprompt,login_uri,login_username,login_password,login_totp'.split(',')
    if len(argv) < 1:
        sys.exit('Supply input file path as command argument')
    in_path = argv[0]
    csv = '.csv'
    csv_out = '_out' + csv
    csv_rem = '_rem' + csv
    out_path = in_path.replace(csv, csv_out)
    rem_path = in_path.replace(csv, csv_rem)
    completed_lines_hash = set()
    line_number = -1
    write_count = 0
    cache = ''

    # Process file
    with open(out_path, 'w', encoding='utf8') as out_file, \
            open(rem_path, 'w', encoding='utf8') as rem_file, \
            open(in_path, 'r', encoding='utf8') as in_file:
        for line in in_file:
            line_number += 1

            # Validate .csv format: the first line must match the expected header
            if line_number == 0 and not line.strip() == ','.join(f):
                print('\nBitwarden CSV format has changed.')
                print('Contact author for update.')
                sys.exit(1)

            # Skip empty lines
            if not line.strip():
                continue

            fields = line.split(',')

            # If the line has fewer fields than expected (e.g. a record
            # broken across physical lines), try to combine it with the
            # cached previous short line
            if len(fields) < len(f):
                line = cache.strip('\n') + line
                cache = line
                fields = line.split(',')
                if len(fields) == len(f):
                    print(f'Recovered with line {line_number}:\n{line}')
                    cache = ''
                else:
                    print(f'Missing fields in line {line_number}:\n{line}')
                    rem_file.write(line)
                    continue
            else:
                cache = ''

            # Generate an MD5 hash based on login URI (reduced to its
            # domain, so differing paths on one site still match),
            # username, and password
            if line_number != 0:
                domain = urlparse(fields[f.index('login_uri')]).netloc
                if len(domain) > 0:
                    fields[f.index('login_uri')] = domain
            token = fields[f.index('login_uri')]
            token += fields[f.index('login_username')]
            token += fields[f.index('login_password')]
            hash_value = hashlib.md5(token.rstrip().encode('utf-8')).hexdigest()

            # Write entry: first occurrence goes to the output file,
            # repeats go to the removed file
            if hash_value not in completed_lines_hash:
                out_file.write(line)
                completed_lines_hash.add(hash_value)
                write_count += 1
            else:
                rem_file.write(line)
                # print(f'Duplicate on line {line_number}:\n{line}')

    # Report
    dup_count = line_number - write_count
    print(f'\nOutput file: {out_path}\n{write_count} unique entries saved')
    print(f'\n{dup_count} duplicates saved to {rem_path}')


if __name__ == "__main__":
    main(sys.argv[1:])
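
A minimal usage sketch, assuming the script above is saved as bw_dedupe.py (the filename is illustrative) and pointed at an unencrypted Bitwarden CSV export:

python3 bw_dedupe.py bitwarden_export.csv
# -> bitwarden_export_out.csv  (unique entries)
# -> bitwarden_export_rem.csv  (duplicates and unrecoverable short lines)

Note that MD5 serves here only as a convenience fingerprint for spotting repeated (domain, username, password) triples; nothing security-sensitive depends on its collision resistance.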
@howird
howird commented Jun 19, 2024

To anyone looking for a script that, in addition to removing duplicates, also helps you find and get rid of old copies of passwords for websites (where you have the same username but both old and new passwords), this script that I wrote can help.
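
A minimal sketch of that idea (an illustration built on the CSV header fields above, not @howird's actual script): group entries by (domain, username) and flag any group holding more than one distinct password, so stale copies can be reviewed.

import csv
from collections import defaultdict
from urllib.parse import urlparse

def find_stale_candidates(path):
    # Map (domain, username) -> set of distinct passwords seen
    groups = defaultdict(set)
    with open(path, newline='', encoding='utf8') as fh:
        for row in csv.DictReader(fh):
            domain = urlparse(row['login_uri']).netloc or row['login_uri']
            groups[(domain, row['login_username'])].add(row['login_password'])
    # A group with multiple passwords likely contains old copies
    for (domain, user), passwords in sorted(groups.items()):
        if len(passwords) > 1:
            print(f'{domain} / {user}: {len(passwords)} distinct passwords')

Unlike the line-based script above, csv.DictReader also handles quoted fields containing commas.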

@topisani

This uses the Bitwarden JSON export, which preserves more data, including properly preserving notes. It also keeps the newest entry when multiple exist. Consider merging this with @howird's logic above for the best combination:
https://gist.github.com/topisani/066b63b87346afe76ffdf0998d4ebc2f
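
A rough sketch of the keep-newest idea (see the linked gist for the real implementation; this assumes each item in the JSON export carries a revisionDate timestamp and a login object, which may vary across Bitwarden versions):

import json

def dedupe_json(in_path, out_path):
    with open(in_path, encoding='utf8') as fh:
        data = json.load(fh)
    newest = {}
    for item in data.get('items', []):
        login = item.get('login') or {}
        key = (item.get('name'), login.get('username'), login.get('password'))
        prev = newest.get(key)
        # ISO-8601 timestamps compare correctly as plain strings
        if prev is None or item.get('revisionDate', '') > prev.get('revisionDate', ''):
            newest[key] = item
    data['items'] = list(newest.values())
    with open(out_path, 'w', encoding='utf8') as fh:
        json.dump(data, fh, indent=2)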
