Simple R script to anonymise data, by reading in multiple CSV files and replacing ID fields with a hashed version
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# Anonymise data by replacing UUN with a hashed unique ID | |
# | |
# Note - in order to prevent "rainbow table" hacking of the hashing, a salt string is added in the anonymise function | |
# This string must not be published. | |
# | |
library(tidyverse) | |
library(digest) # see https://jangorecki.github.io/blog/2014-11-07/Data-Anonymization-in-R.html | |
library(data.table) | |
# use the contents of a .txt file as the secret salt for the hash function | |
fileName <- '_SECRET SALT.txt' | |
secretsalt <- readChar(fileName, file.info(fileName)$size) | |
# these columns will be removed from each dataframe, and replaced with a new AnonID column | |
cols_to_anonymise = c("UUN") | |
# Function that takes in a df and generates AnonID from cols_to_anon | |
# https://www.r-bloggers.com/anonymising-data/ | |
anonymise <- function(data, cols_to_anon, algo = "crc32") { | |
if(!require(digest)) stop("digest package is required") | |
to_anon <- subset(data, select = cols_to_anon) | |
to_anon$salt <- rep(secretsalt, nrow(to_anon)) # This should not be published, to preserve anonymity | |
unname(apply(to_anon, 1, digest, algo = algo)) | |
} | |
# Create a list of csv files to anonymise. | |
# In this case, we find all the .csv files which do not contain ANON in the filename, | |
# and only using files in the "rawdata-XXX" sub-directories | |
csv_files = getwd() %>% | |
list.files(pattern = ".*.csv", recursive = TRUE, include.dirs = FALSE) %>% | |
.[!str_detect(., "ANON")] %>% | |
.[str_detect(., "rawdata-.*/")] %>% | |
enframe() | |
# This will be used to store the AnonID lookup table | |
anon_key = NULL | |
# Loop over all the csv files and anonymise the cols_to_anon | |
for (filepath in csv_files$value) { | |
print(filepath) | |
data = read.csv(filepath) | |
# Remove identifiable data that we don't need | |
# (you should add/remove fields here as necessary) | |
data = data %>% | |
select(-contains("Surname")) %>% | |
select(-contains("First.name")) %>% | |
select(-contains("Forename")) %>% | |
select(-contains("Institution")) %>% | |
select(-contains("Department")) %>% | |
select(-contains("Exam.Number")) %>% | |
select(-contains("full.name")) | |
# Data exported from Moodle includes an "Email address" field, which is of the form "UUN@...." | |
# so here we extract just the UUN | |
if("Email.address" %in% names(data)){ | |
data = data %>% | |
mutate(UUN = toupper(substr(`Email.address`,1,8))) %>% | |
select(UUN, everything(), -`Email.address`) | |
} | |
# Tidy up the column names from Moodle quiz data | |
names(data) = str_replace(names(data), "Q\\.\\.", "Q") | |
# Anonymise the UUN | |
data = data %>% | |
mutate( | |
AnonID = paste0("S",anonymise(data, cols_to_anonymise)) | |
) %>% | |
select(AnonID, everything()) | |
# Add to the anon_key lookup table | |
anon_key = bind_rows(anon_key, data %>% select(AnonID, cols_to_anonymise)) | |
# Remove the UUN from the data | |
data = data %>% select(-UUN) | |
# Show the results in the console so you can see at a glance if there are any unwanted columns still included | |
print(head(data)) | |
# write out the anonymised data | |
# TODO - you will want to customise the new file path and name | |
filename = str_replace(filepath, "rawdata.*/", "") | |
write.csv(data, paste0("data-ANON/ANON_",filename), row.names = FALSE, na = "") | |
} | |
# Save the AnonID lookup table | |
write.csv(anon_key %>% distinct(), "__ANON_KEY.csv", row.names = FALSE, na = "") | |
# Visual check that there is only one student for each AnonID | |
anon_key %>% | |
distinct() %>% | |
group_by(AnonID) %>% | |
tally() %>% | |
arrange(-n) | |
# Visual check that there is only one Anon ID for each student | |
anon_key %>% | |
distinct() %>% | |
group_by(UUN) %>% | |
tally() %>% | |
arrange(-n) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment