@serif
Last active June 19, 2024 19:50
Bitwarden Duplicate Entry Remover v2
#!/usr/bin/env python3
# updated 2023-11-27
# updated 2023-10-12
# updated 2021
# updated 2020
# created 2018
import sys
import hashlib
from urllib.parse import urlparse


def main(argv):
    # Fields in Bitwarden CSV
    f = 'folder,favorite,type,name,notes,fields,reprompt,login_uri,login_username,login_password,login_totp'.split(',')
    if len(argv) < 1:
        sys.exit('Supply input file path as command argument')
    in_path = argv[0]
    csv = '.csv'
    csv_out = '_out' + csv
    csv_rem = '_rem' + csv
    out_path = in_path.replace(csv, csv_out)
    rem_path = in_path.replace(csv, csv_rem)
    completed_lines_hash = set()
    line_number = -1
    write_count = 0
    cache = ''
    # Process file
    with open(out_path, 'w', encoding='utf8') as out_file, \
         open(rem_path, 'w', encoding='utf8') as rem_file, \
         open(in_path, 'r', encoding='utf8') as in_file:
        for line in in_file:
            line_number += 1
            # Validate .csv format
            if line_number == 0 and not line.strip() == ','.join(f):
                print('\nBitwarden CSV format has changed.')
                print('Contact author for update.')
                exit(1)
            # Skip empty lines
            if not line.strip():
                continue
            fields = line.split(',')
            # If the line has fewer fields than expected,
            # try to combine with the previous line
            if len(fields) < len(f):
                # Add previous line if short
                line = cache.strip('\n') + line
                cache = line
                fields = line.split(',')
                if len(fields) == len(f):
                    print(f'Recovered with line {line_number}:\n{line}')
                    cache = ''
                else:
                    print(f'Missing fields in line {line_number}:\n{line}')
                    rem_file.write(line)
                    continue
            else:
                cache = ''
            # Generate an MD5 hash based on login URI, username, and password
            if line_number != 0:
                domain = urlparse(fields[f.index('login_uri')]).netloc
                if len(domain) > 0:
                    fields[f.index('login_uri')] = domain
            token = fields[f.index('login_uri')]
            token += fields[f.index('login_username')]
            token += fields[f.index('login_password')]
            hashValue = hashlib.md5(token.rstrip().encode('utf-8')).hexdigest()
            # Write entry
            if hashValue not in completed_lines_hash:
                out_file.write(line)
                completed_lines_hash.add(hashValue)
                write_count += 1
            else:
                rem_file.write(line)
                # print(f'Duplicate on line {line_number}:\n{line}')
    # Report
    dup_count = line_number - write_count
    print(f'\nOutput file: {out_path}\n{write_count} unique entries saved')
    print(f'\n{dup_count} duplicates saved to {rem_path}')


if __name__ == "__main__":
    main(sys.argv[1:])
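
Usage is a single command. A minimal sketch, assuming the script is saved as dedup.py and the export as bitwarden_export.csv (both names hypothetical); the output paths follow from the suffix logic above:

python3 dedup.py bitwarden_export.csv
# writes bitwarden_export_out.csv (unique entries)
# writes bitwarden_export_rem.csv (removed lines)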
lpr2rpl commented Jan 30, 2024

Thank you for providing this code. Unfortunately, it fails with multi-line entries created by the latest Bitwarden on Windows 10.
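
For context, a record with a multi-line note looks roughly like this in the export (hypothetical values, fields per the CSV header above), so a line-by-line reader sees two short fragments instead of one eleven-field record:

,0,login,Example,"first line of note
second line",,0,https://example.com,user,secret,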

@oscarsiles

I have managed to import a file with multi-line entries using some modifications (full code below). It also works for me when there are curly braces inside password fields. The only issue is that multiple deleted "lines" are created in the _rem file, one for each iteration through the multi-line portion.

#!/usr/bin/env python3
# updated 2024-02-07
# updated 2023-11-27
# updated 2023-10-12
# updated 2021
# updated 2020
# created 2018
import hashlib
import sys
from urllib.parse import urlparse

# With a little help from...
# https://stackoverflow.com/questions/29375614/how-to-get-csv-reader-to-ignore-commas-within-braces-curly-square-angle
l_braces = {"{"}
r_braces = {"}"}


def split(s):
    brace_count = 0
    quote_count = 0
    breaks = []

    for i, c in enumerate(s):
        if c == '"':
            quote_count += 1
            if quote_count % 2 == 1:
                brace_count += 1
            else:
                brace_count -= 1

        if c in l_braces:
            brace_count += 1

        if c in r_braces:
            brace_count -= 1

        if (c in [","]) and (brace_count == 0):
            breaks.append(i)

    pieces = []

    lag = 0
    for b in breaks:
        pieces.append(s[lag:b].strip())
        lag = b + 1

    try:
        pieces.append(s[breaks[-1] + 1 :].strip())
    except IndexError:
        pieces = s.split(",")

    return pieces
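
# A quick illustration of split() on hypothetical inputs: commas inside
# double quotes or braces do not separate fields, top-level commas do.
#   split('a,"b,c",d')  ->  ['a', '"b,c"', 'd']
#   split('x,{y,z},w')  ->  ['x', '{y,z}', 'w']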


def main(argv):
    # Fields in Bitwarden CSV
    f = "folder,favorite,type,name,notes,fields,reprompt,login_uri,login_username,login_password,login_totp".split(",")

    if len(argv) < 1:
        sys.exit("Supply input file path as command argument")

    in_path = argv[0]
    csv = ".csv"
    csv_out = "_out" + csv
    csv_rem = "_rem" + csv
    out_path = in_path.replace(csv, csv_out)
    rem_path = in_path.replace(csv, csv_rem)
    completed_lines_hash = set()
    line_number = -1
    write_count = 0
    cache = ""

    # Process file
    with open(out_path, "w", encoding="utf8") as out_file, open(
        rem_path, "w", encoding="utf8"
    ) as rem_file, open(in_path, "r", encoding="utf8") as in_file:
        for line in in_file:
            line_number += 1

            # Validate .csv format
            if line_number == 0 and not line.strip() == ",".join(f):
                print("\nBitwarden CSV format has changed.")
                print("Contact author for update.")
                exit(1)

            # Skip empty lines
            if not line.strip():
                continue
            fields = split(line)

            # If the line has fewer fields than expected,
            # try to combine with the previous line
            if len(fields) < len(f):
                # Add previous line if short
                line = cache + line
                cache = line
                fields = split(line)
                if len(fields) == len(f):
                    print(f"Recovered with line {line_number}:\n{line}")
                    cache = ""
                else:
                    print(f"Missing fields in line {line_number}:\n{line}")
                    rem_file.write(line)
                    continue
            else:
                cache = ""

            # Generate an MD5 hash based on login URI, username, and password
            if line_number != 0:
                domain = urlparse(fields[f.index("login_uri")]).netloc
                if len(domain) > 0:
                    fields[f.index("login_uri")] = domain
            token = fields[f.index("login_uri")]
            token += fields[f.index("login_username")]
            token += fields[f.index("login_password")]
            if token == "":
                token = fields[f.index("notes")]
            hashValue = hashlib.md5(token.rstrip().encode("utf-8")).hexdigest()

            # Write entry
            if hashValue not in completed_lines_hash:
                out_file.write(line)
                completed_lines_hash.add(hashValue)
                write_count += 1
            else:
                rem_file.write(line)
                # print(f'Duplicate on line {line_number}:\n{line}')

    # Report
    dup_count = line_number - write_count
    print(f"\nOutput file: {out_path}\n{write_count} unique entries saved")
    print(f"\n{dup_count} duplicates saved to {rem_path}")


if __name__ == "__main__":
    main(sys.argv[1:])

howird commented Jun 19, 2024

To anyone new looking for a script that, in addition to removing duplicates, also helps you find and get rid of old copies of passwords for websites (where you have the same username but both old and new passwords): this script that I wrote can help.
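
The linked script is not reproduced here, but the core idea (flag entries that share a site and username yet hold different passwords) can be sketched in a few lines. A hypothetical sketch, not howird's code; note that csv.DictReader also handles quoted multi-line fields natively:

import csv
from collections import defaultdict

def find_stale_credentials(path):
    # Group passwords by (login_uri, login_username) and report any
    # pair that has accumulated more than one distinct password.
    groups = defaultdict(set)
    with open(path, newline="", encoding="utf8") as fh:
        for row in csv.DictReader(fh):
            if row["login_password"]:
                groups[(row["login_uri"], row["login_username"])].add(row["login_password"])
    for (uri, user), passwords in groups.items():
        if len(passwords) > 1:
            print(f"{uri} / {user}: {len(passwords)} different passwords")

find_stale_credentials("bitwarden_export.csv")  # hypothetical filename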
