Skip to content

Instantly share code, notes, and snippets.

@melvinwevers
Created January 2, 2016 20:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save melvinwevers/442e61a70c7e2f675663 to your computer and use it in GitHub Desktop.
Save melvinwevers/442e61a70c7e2f675663 to your computer and use it in GitHub Desktop.
script to filter out double Telegraaf from Archive
import csv
import sys
import re
from datetime import datetime
csv.field_size_limit(sys.maxsize)
field_id = 'ddd:11'
dateranges = [("1951-01-01", "1951-12-31"),
("1962-07-01", "1962-12-31"),
("1963-01-01", "1963-09-30"),
("1965-07-01", "1965-07-30"),
("1965-10-01", "1965-10-31"),
("1966-04-01", "1966-11-30"),
("1969-01-01", "1989-12-31")
]
dateranges = list(map(lambda dr:
tuple(map(lambda x:
datetime.strptime(x, "%Y-%m-%d"), dr)),
dateranges))
def datefilter(x):
x = datetime.strptime(x, "%Y-%m-%d")
for r in dateranges:
if r[0] <= x and r[1] >= x:
return True
return False
output = []
with open('my_file.csv', 'r') as f:
reader = csv.reader(f, delimiter='\t', quotechar='"')
next(reader)
for row in reader:
if datefilter(row[4]):
var = re.search('\\b'+field_id, row[3])
if bool(var) == False:
output.append(row)
else:
output.append(row)
with open('output.csv', 'w') as outputfile:
writer = csv.writer(outputfile, delimiter='\t', quotechar='"')
writer.writerows(output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment