Skip to content

Instantly share code, notes, and snippets.

@georgekinnear
Last active May 12, 2022 11:08
Show Gist options
  • Save georgekinnear/d24c8549ac5be9b99dd5bbd2bb514eac to your computer and use it in GitHub Desktop.
Save georgekinnear/d24c8549ac5be9b99dd5bbd2bb514eac to your computer and use it in GitHub Desktop.
Simple R script to anonymise data, by reading in multiple CSV files and replacing ID fields with a hashed version
#
# Anonymise data by replacing UUN with a hashed unique ID
#
# Note - in order to prevent "rainbow table" hacking of the hashing, a salt string is added in the anonymise function
# This string must not be published.
#
library(tidyverse)
library(digest) # see https://jangorecki.github.io/blog/2014-11-07/Data-Anonymization-in-R.html
library(data.table)
# use the contents of a .txt file as the secret salt for the hash function
fileName <- '_SECRET SALT.txt'
secretsalt <- readChar(fileName, file.info(fileName)$size)
# these columns will be removed from each dataframe, and replaced with a new AnonID column
cols_to_anonymise = c("UUN")
# Function that takes in a df and generates AnonID from cols_to_anon
# https://www.r-bloggers.com/anonymising-data/
anonymise <- function(data, cols_to_anon, algo = "crc32") {
if(!require(digest)) stop("digest package is required")
to_anon <- subset(data, select = cols_to_anon)
to_anon$salt <- rep(secretsalt, nrow(to_anon)) # This should not be published, to preserve anonymity
unname(apply(to_anon, 1, digest, algo = algo))
}
# Create a list of csv files to anonymise.
# In this case, we find all the .csv files which do not contain ANON in the filename,
# and only using files in the "rawdata-XXX" sub-directories
csv_files = getwd() %>%
list.files(pattern = ".*.csv", recursive = TRUE, include.dirs = FALSE) %>%
.[!str_detect(., "ANON")] %>%
.[str_detect(., "rawdata-.*/")] %>%
enframe()
# This will be used to store the AnonID lookup table
anon_key = NULL
# Loop over all the csv files and anonymise the cols_to_anon
for (filepath in csv_files$value) {
print(filepath)
data = read.csv(filepath)
# Remove identifiable data that we don't need
# (you should add/remove fields here as necessary)
data = data %>%
select(-contains("Surname")) %>%
select(-contains("First.name")) %>%
select(-contains("Forename")) %>%
select(-contains("Institution")) %>%
select(-contains("Department")) %>%
select(-contains("Exam.Number")) %>%
select(-contains("full.name"))
# Data exported from Moodle includes an "Email address" field, which is of the form "UUN@...."
# so here we extract just the UUN
if("Email.address" %in% names(data)){
data = data %>%
mutate(UUN = toupper(substr(`Email.address`,1,8))) %>%
select(UUN, everything(), -`Email.address`)
}
# Tidy up the column names from Moodle quiz data
names(data) = str_replace(names(data), "Q\\.\\.", "Q")
# Anonymise the UUN
data = data %>%
mutate(
AnonID = paste0("S",anonymise(data, cols_to_anonymise))
) %>%
select(AnonID, everything())
# Add to the anon_key lookup table
anon_key = bind_rows(anon_key, data %>% select(AnonID, cols_to_anonymise))
# Remove the UUN from the data
data = data %>% select(-UUN)
# Show the results in the console so you can see at a glance if there are any unwanted columns still included
print(head(data))
# write out the anonymised data
# TODO - you will want to customise the new file path and name
filename = str_replace(filepath, "rawdata.*/", "")
write.csv(data, paste0("data-ANON/ANON_",filename), row.names = FALSE, na = "")
}
# Save the AnonID lookup table
write.csv(anon_key %>% distinct(), "__ANON_KEY.csv", row.names = FALSE, na = "")
# Visual check that there is only one student for each AnonID
anon_key %>%
distinct() %>%
group_by(AnonID) %>%
tally() %>%
arrange(-n)
# Visual check that there is only one Anon ID for each student
anon_key %>%
distinct() %>%
group_by(UUN) %>%
tally() %>%
arrange(-n)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment