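"""Find duplicate values in a CSV file.

Fingerprints the chosen field (default: url) of every row with MD5 and
writes each repeated value to duplicates.txt next to the input file.
"""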
from optparse import OptionParser
import os
import sys
import csv
import hashlib
# Work around "_csv.Error: field larger than field limit"
# raised by the csv module on rows with very large fields.
csv.field_size_limit(sys.maxsize)
def find_duplicates():
    usage = "Usage: %prog input.csv [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-f",
                      "--field",
                      dest="field",
                      metavar="FIELD",
                      default="url",
                      help="name of the field to check for duplicates "
                           "(default: url)")
    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error('you should pass a filename as a single argument')
    seen_fingerprints = set()
    filename = args[0]
    out_filename = os.path.join(os.path.dirname(filename), 'duplicates.txt')
    duplicates_count = 0
    limit = None
    with open(filename) as csv_file, open(out_filename, 'w') as out_file:
        reader = csv.DictReader(csv_file)
        count = 0
        try:
            for row in reader:
                count += 1
                url = row[options.field]
                # store an MD5 fingerprint instead of the full value
                # to keep memory usage low on large inputs
                fp = hashlib.md5(url).hexdigest()
                if fp in seen_fingerprints:
                    duplicates_count += 1
                    out_file.write(url + os.linesep)
                else:
                    seen_fingerprints.add(fp)
                if limit and count >= limit:
                    break
                if count % 100000 == 0:
                    print 'Items processed:', count
        except Exception:
            print 'Problem occurred processing line', count
            raise
    print 'Output file:', out_filename
    print 'Duplicates found:', duplicates_count


if __name__ == '__main__':
    find_duplicates()
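

# Usage example (hypothetical file name):
#
#   $ python find_duplicates.py items.csv --field url
#
# Duplicate values of the chosen field end up in duplicates.txt,
# one per line, next to the input file.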