georgekinnear/anon.R

## anon.R
#
# Anonymise data by replacing UUN with a hashed unique ID
#
# Note - in order to prevent "rainbow table" hacking of the hashing, a salt string is added in the anonymise function
#        This string must not be published.
#


library(tidyverse)
library(digest) # see https://jangorecki.github.io/blog/2014-11-07/Data-Anonymization-in-R.html
library(data.table)

# use the contents of a .txt file as the secret salt for the hash function
fileName <- '_SECRET SALT.txt'
secretsalt <- readChar(fileName, file.info(fileName)$size)

# these columns will be removed from each dataframe, and replaced with a new AnonID column
cols_to_anonymise = c("UUN")

# Function that takes in a df and generates AnonID from cols_to_anon
# https://www.r-bloggers.com/anonymising-data/
anonymise <- function(data, cols_to_anon, algo = "crc32") {
  if(!require(digest)) stop("digest package is required")
  to_anon <- subset(data, select = cols_to_anon)
  to_anon$salt <- rep(secretsalt, nrow(to_anon)) # This should not be published, to preserve anonymity
  unname(apply(to_anon, 1, digest, algo = algo))
}

# Create a list of csv files to anonymise.
# In this case, we find all the .csv files which do not contain ANON in the filename,
# and only using files in the "rawdata-XXX" sub-directories
csv_files = getwd() %>%
  list.files(pattern = ".*.csv", recursive = TRUE, include.dirs = FALSE) %>%
  .[!str_detect(., "ANON")] %>%
  .[str_detect(., "rawdata-.*/")] %>%
  enframe()

# This will be used to store the AnonID lookup table
anon_key = NULL

# Loop over all the csv files and anonymise the cols_to_anon
for (filepath in csv_files$value) {

  print(filepath)
  data = read.csv(filepath)

  # Remove identifiable data that we don't need
  # (you should add/remove fields here as necessary)
  data = data %>%
    select(-contains("Surname")) %>%
    select(-contains("First.name")) %>%
    select(-contains("Forename")) %>%
    select(-contains("Institution")) %>%
    select(-contains("Department")) %>%
    select(-contains("Exam.Number")) %>%
    select(-contains("full.name"))

  # Data exported from Moodle includes an "Email address" field, which is of the form "UUN@...."
  # so here we extract just the UUN
  if("Email.address" %in% names(data)){
    data = data %>%
      mutate(UUN = toupper(substr(`Email.address`,1,8))) %>%
      select(UUN, everything(), -`Email.address`)
  }

  # Tidy up the column names from Moodle quiz data
  names(data) = str_replace(names(data), "Q\\.\\.", "Q")

  # Anonymise the UUN
  data = data %>%
    mutate(
      AnonID = paste0("S",anonymise(data, cols_to_anonymise))
    ) %>%
    select(AnonID, everything())

  # Add to the anon_key lookup table
  anon_key = bind_rows(anon_key, data %>% select(AnonID, cols_to_anonymise))

  # Remove the UUN from the data
  data = data %>% select(-UUN)

  # Show the results in the console so you can see at a glance if there are any unwanted columns still included
  print(head(data))

  # write out the anonymised data
  # TODO - you will want to customise the new file path and name
  filename = str_replace(filepath, "rawdata.*/", "")
  write.csv(data, paste0("data-ANON/ANON_",filename), row.names = FALSE, na = "")

}

# Save the AnonID lookup table
write.csv(anon_key %>% distinct(), "__ANON_KEY.csv", row.names = FALSE, na = "")


# Visual check that there is only one student for each AnonID
anon_key %>%
  distinct() %>%
  group_by(AnonID) %>%
  tally() %>%
  arrange(-n)

# Visual check that there is only one Anon ID for each student
anon_key %>%
  distinct() %>%
  group_by(UUN) %>%
  tally() %>%
  arrange(-n)
	#
	# Anonymise data by replacing UUN with a hashed unique ID
	#
	# Note - in order to prevent "rainbow table" hacking of the hashing, a salt string is added in the anonymise function
	# This string must not be published.
	#


	library(tidyverse)
	library(digest) # see https://jangorecki.github.io/blog/2014-11-07/Data-Anonymization-in-R.html
	library(data.table)

	# use the contents of a .txt file as the secret salt for the hash function
	fileName <- '_SECRET SALT.txt'
	secretsalt <- readChar(fileName, file.info(fileName)$size)

	# these columns will be removed from each dataframe, and replaced with a new AnonID column
	cols_to_anonymise = c("UUN")

	# Function that takes in a df and generates AnonID from cols_to_anon
	# https://www.r-bloggers.com/anonymising-data/
	anonymise <- function(data, cols_to_anon, algo = "crc32") {
	if(!require(digest)) stop("digest package is required")
	to_anon <- subset(data, select = cols_to_anon)
	to_anon$salt <- rep(secretsalt, nrow(to_anon)) # This should not be published, to preserve anonymity
	unname(apply(to_anon, 1, digest, algo = algo))
	}

	# Create a list of csv files to anonymise.
	# In this case, we find all the .csv files which do not contain ANON in the filename,
	# and only using files in the "rawdata-XXX" sub-directories
	csv_files = getwd() %>%
	list.files(pattern = ".*.csv", recursive = TRUE, include.dirs = FALSE) %>%
	.[!str_detect(., "ANON")] %>%
	.[str_detect(., "rawdata-.*/")] %>%
	enframe()

	# This will be used to store the AnonID lookup table
	anon_key = NULL

	# Loop over all the csv files and anonymise the cols_to_anon
	for (filepath in csv_files$value) {

	print(filepath)
	data = read.csv(filepath)

	# Remove identifiable data that we don't need
	# (you should add/remove fields here as necessary)
	data = data %>%
	select(-contains("Surname")) %>%
	select(-contains("First.name")) %>%
	select(-contains("Forename")) %>%
	select(-contains("Institution")) %>%
	select(-contains("Department")) %>%
	select(-contains("Exam.Number")) %>%
	select(-contains("full.name"))

	# Data exported from Moodle includes an "Email address" field, which is of the form "UUN@...."
	# so here we extract just the UUN
	if("Email.address" %in% names(data)){
	data = data %>%
	mutate(UUN = toupper(substr(`Email.address`,1,8))) %>%
	select(UUN, everything(), -`Email.address`)
	}

	# Tidy up the column names from Moodle quiz data
	names(data) = str_replace(names(data), "Q\\.\\.", "Q")

	# Anonymise the UUN
	data = data %>%
	mutate(
	AnonID = paste0("S",anonymise(data, cols_to_anonymise))
	) %>%
	select(AnonID, everything())

	# Add to the anon_key lookup table
	anon_key = bind_rows(anon_key, data %>% select(AnonID, cols_to_anonymise))

	# Remove the UUN from the data
	data = data %>% select(-UUN)

	# Show the results in the console so you can see at a glance if there are any unwanted columns still included
	print(head(data))

	# write out the anonymised data
	# TODO - you will want to customise the new file path and name
	filename = str_replace(filepath, "rawdata.*/", "")
	write.csv(data, paste0("data-ANON/ANON_",filename), row.names = FALSE, na = "")

	}

	# Save the AnonID lookup table
	write.csv(anon_key %>% distinct(), "__ANON_KEY.csv", row.names = FALSE, na = "")


	# Visual check that there is only one student for each AnonID
	anon_key %>%
	distinct() %>%
	group_by(AnonID) %>%
	tally() %>%
	arrange(-n)

	# Visual check that there is only one Anon ID for each student
	anon_key %>%
	distinct() %>%
	group_by(UUN) %>%
	tally() %>%
	arrange(-n)