Skip to content

Instantly share code, notes, and snippets.

@dramsay
Last active June 19, 2019 19:18
Show Gist options
  • Save dramsay/9091c6684d33c554543022f5f7beee4e to your computer and use it in GitHub Desktop.
Save dramsay/9091c6684d33c554543022f5f7beee4e to your computer and use it in GitHub Desktop.
# Usage: run from the command line, passing the path to the CSV file to anonymize:
#   ruby anonymizer.rb /path/to/metrics_file.csv
# The anonymized rows are printed to stdout and written to anonymized.csv.
require 'csv'
require 'securerandom'
# Header patterns used to locate the columns that must be anonymized.

# Column holding the hashed IP: header ends in "ip".
HASHED_IP = /ip$/
# Identifier columns: header ends in "id".
IDENTIFIERS = /id$/
# Column holding the downloaded-seconds count.
SECONDS = /seconds_downloaded/
# Perturbs the hashed IP stored in +row+ by replacing exactly one character
# with a different, randomly chosen lowercase letter.
#
# The replaced position is picked among the first 17 characters, or among all
# characters when the value is shorter, so the index is always valid and the
# value keeps its length.
#
# @param row [Array<String, nil>] CSV row; the hashed-IP cell is reassigned
# @param hashed_ip_index [Integer, nil] column index of the hashed IP
# @return [Array<String, nil>] the same +row+ object; returned untouched when
#   the column index is nil or the cell is blank
def transform_ip(row, hashed_ip_index)
  return row if hashed_ip_index.nil?

  hashed_ip = row[hashed_ip_index]
  return row if hashed_ip.nil? || hashed_ip.empty?

  # rand(n) yields 0...n, so this never points past the end of a short value.
  index = rand([hashed_ip.length, 17].min)
  # Work on a copy so a frozen or shared source string is never mutated.
  scrambled = hashed_ip.dup
  # Excluding the current character guarantees the value actually changes.
  scrambled[index] = (('a'..'z').to_a - [hashed_ip[index]]).sample
  row[hashed_ip_index] = scrambled
  row
end
# Obscures the seconds value stored in +row+: adds a random integer offset in
# -100..100 to it, scales the sum by 1.3, rounds, and stores the result back
# as a string.
#
# @param row [Array<String, nil>] CSV row; the seconds cell is reassigned
# @param seconds_index [Integer, nil] column index of the seconds value
# @return [Array<String, nil>] the same +row+ object; returned untouched when
#   the column index is nil or the cell is blank
def transform_seconds(row, seconds_index)
  return row if seconds_index.nil?

  seconds = row[seconds_index]
  # Leave missing values missing instead of inventing a number for them.
  return row if seconds.to_s.empty?

  jitter = rand(-100..100)
  row[seconds_index] = ((seconds.to_i + jitter) * 1.3).round.to_s
  row
end
# Script entry point: anonymizes the CSV file named by the first command-line
# argument, prints the result to stdout and writes it to anonymized.csv in the
# current working directory.
input_path = ARGV[0]
abort 'usage: ruby anonymizer.rb /path/to/metrics_file.csv' if input_path.nil?

# take headers plus first 100 rows
rows = CSV.foreach(input_path).take(101)
headers = rows.shift
abort "#{input_path} contains no header row" if headers.nil?

# Regexp#match? returns false for a nil (blank) header cell.
hashed_ip_index = headers.index { |header| HASHED_IP.match?(header) }
seconds_index = headers.index { |header| SECONDS.match?(header) }
# Select by position rather than by name so that columns sharing the same
# header text are all anonymized, not just the first of them.
identifier_indexes = headers.each_index.select { |i| IDENTIFIERS.match?(headers[i]) }

data = CSV.generate do |csv|
  csv << headers
  rows.each do |row|
    # Skip a transformation when the file has no matching column.
    transform_ip(row, hashed_ip_index) if hashed_ip_index
    transform_seconds(row, seconds_index) if seconds_index
    # Replace every non-blank identifier with a fresh random UUID.
    identifier_indexes.each do |i|
      row[i] = SecureRandom.uuid unless row[i].to_s.empty?
    end
    csv << row
  end
end

puts data
File.write('anonymized.csv', data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment