Skip to content

Instantly share code, notes, and snippets.

@davidwtbuxton
Created April 3, 2012 12:39
Show Gist options
  • Save davidwtbuxton/2291658 to your computer and use it in GitHub Desktop.
Save davidwtbuxton/2291658 to your computer and use it in GitHub Desktop.
De-duplicate UKOOA data
# Python 2.7
from collections import OrderedDict
in_name = 'F87.p190'
out_name = 'my_results.txt'
# Maps a (line number, x, y) key to the first full line seen with that key.
# An OrderedDict remembers insertion order, so the lines are written back out
# in the order they were first encountered in the input.
results = OrderedDict()

# Keep the input and output files separate; it makes life much easier.
# The 'U' in 'rU' enables universal new-lines, which means it doesn't matter if
# the data uses Mac, Unix or Windows new-line conventions.
# NOTE(review): this script targets Python 2.7. Python 3 removed the 'U' flag
# in 3.11 (plain 'r' already translates new-lines there), so switch to 'r' if
# this is ever ported.
with open(in_name, 'rU') as in_file:
    for line in in_file:
        number = line[1:12]   # The range of the line number
        x = line[25:35]       # Range for the x co-ord
        y = line[35:45]       # Range for the y co-ord
        key = (number, x, y)  # Tuple used as the unique key
        # setdefault only stores the line when the key is new, so later
        # duplicates of an already-recorded key are ignored.
        results.setdefault(key, line)

# All the unique lines are now accumulated; write them out in one go.
with open(out_name, 'w') as out_file:
    out_file.writelines(results.values())
# Alternate strategy, which ought to be a little faster and use less memory --
# this might matter for really big data. Instead of holding every unique line
# in memory, only the keys are remembered (in a set) and each line is streamed
# straight from the input file to the output file, both open at the same time.
# Note that this re-reads in_name and rewrites out_name from scratch.
with open(in_name, 'rU') as in_file, open(out_name, 'w') as out_file:
    # Keys of every line already written to the output.
    seen_keys = set()
    for line in in_file:
        # Same (line number, x, y) key as the first approach, built inline.
        key = (line[1:12], line[25:35], line[35:45])
        # A key we have met before means a duplicate line: skip it.
        if key in seen_keys:
            continue
        # First occurrence: remember the key and emit the line.
        seen_keys.add(key)
        out_file.write(line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment