Skip to content

Instantly share code, notes, and snippets.

@dingodoppelt
Created April 4, 2023 12:41
Show Gist options
  • Save dingodoppelt/ddc47032d2980d5691f13b11750c8171 to your computer and use it in GitHub Desktop.
Save dingodoppelt/ddc47032d2980d5691f13b11750c8171 to your computer and use it in GitHub Desktop.
Remove duplicate rows from a CSV file and export the result to both CSV and JSON.
import csv
import sys
import os.path
import json
def writeJSONFile(dataArray, csvFileOut):
    """Serialize *dataArray* to JSON and write it to a file.

    Args:
        dataArray: Any object ``json.dumps`` can serialize.
        csvFileOut: Path of the file to write. Despite its name, this is
            the JSON destination; the file is created or overwritten and
            encoded as UTF-8.
    """
    with open(csvFileOut, mode='w', encoding='utf-8') as sink:
        payload = json.dumps(dataArray)
        sink.write(payload)
def removeDuplicates(inputCsvFile, outputCsvFile, outputJsonFile):
    """Copy a CSV file, keeping only the first row for each ``post_id``.

    The surviving rows are written to *outputCsvFile* (with the input's
    header) and also passed as a list of dicts to ``writeJSONFile`` for
    *outputJsonFile*.

    Args:
        inputCsvFile: Path of the CSV file to read; its first row is the header.
        outputCsvFile: Path of the de-duplicated CSV file. It is created or
            overwritten, so it always matches the JSON output.
        outputJsonFile: Path handed to ``writeJSONFile`` for the JSON copy.

    Raises:
        ValueError: If the input has a header without a ``post_id`` column.
            Every row would otherwise share the key ``None`` and all but
            the first row would be dropped silently.
    """
    # newline='' is what the csv module documents for files handed to its
    # reader/writer: it keeps quoted embedded newlines intact on input and
    # avoids doubled line endings on output.
    with open(inputCsvFile, 'r', newline='') as infile:
        results = csv.DictReader(infile)
        # fieldnames is None for an empty input file.
        fieldnames = results.fieldnames or []
        if fieldnames and 'post_id' not in fieldnames:
            # Checked before the output is opened so no stray file is left.
            raise ValueError(
                "input CSV {!r} has no 'post_id' column".format(inputCsvFile)
            )

        # A set gives O(1) membership tests for the ids already written.
        seenPostIds = set()
        jsonArray = []
        with open(outputCsvFile, 'w', newline='') as outfile:
            writer = csv.DictWriter(outfile, fieldnames)
            if fieldnames:
                writer.writeheader()
            for result in results:
                postId = result.get('post_id')
                if postId in seenPostIds:
                    continue
                seenPostIds.add(postId)
                jsonArray.append(result)
                writer.writerow(result)
    writeJSONFile(jsonArray, outputJsonFile)
def buildOutputPaths(csvFileIn):
    """Return the ``(csv, json)`` output paths derived from *csvFileIn*.

    Only the final extension is replaced, so dots in directory names or
    earlier in the file name (``./data/my.posts.csv``) are preserved.
    """
    base = os.path.splitext(csvFileIn)[0]
    return base + '-nodup.csv', base + '-nodup.json'


def main(argv):
    """Run the de-duplication for the input file named in ``argv[1]``.

    Prints an error message and returns without doing anything when no
    argument is given, when the input file does not exist, or when the
    CSV output file already exists.
    """
    if len(argv) < 2:
        # Indexing argv[1] without this check raises IndexError.
        print('ERR: no input file given!')
        return
    csvFileIn = argv[1]
    if not os.path.isfile(csvFileIn):
        print('ERR: input file not found!')
        return
    csvFileOut, jsonFileOut = buildOutputPaths(csvFileIn)
    if os.path.isfile(csvFileOut):
        print('ERR: output file already exists!')
    else:
        removeDuplicates(csvFileIn, csvFileOut, jsonFileOut)


if __name__ == '__main__':
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment