Skip to content

Instantly share code, notes, and snippets.

Last active January 26, 2019 22:30
Show Gist options
  • Save cmacrander/f88b82655c86b9ff8ad097fe26fc1df0 to your computer and use it in GitHub Desktop.
Save cmacrander/f88b82655c86b9ff8ad097fe26fc1df0 to your computer and use it in GitHub Desktop.
Functions for the deidentification tool.
var DEIDENTIFIER = (function () {
'use strict';
// ** Constants ** //

// Delimiter between the fields of one pasted row.
var separator = '\t';

// A valid salt: 20-40 characters, no white space.
var saltRegex = /^\S{20,40}$/;

// A valid token: at least 3 characters, no white space.
var tokenRegex = /^\S{3,}$/;

// A plausible email: an @ with something before and after it.
var emailRegex = /^.+@.+$/;

// Header name of the token column on input, and its name on output.
var tokenColumnName = 'plaintext_token';
var tokenColumnRename = 'token';

// Upper bound on the number of rows accepted in one paste.
var maximumRows = 1000000;

// Columns that must be present in an uploaded panel.
var panelRequiredColumns = ['link', 'link_expiration', 'token'];

// Hint appended to alerts that trailing blank lines can trigger.
var trailingLinesWarning =
    " May also be caused by empty rows at the end of the data.";
// ** Initialization ** //
// Mutable state shared by the functions in this module.

// Alert bookkeeping.
var alert = false;       // true if an alert is active
var alertMessages = {};  // messages to show to the user

// Deidentifier state.
var tokenIndex;          // which column holds the token

// Email selection state.
var mapTextarea;         // DOM node for "Email-Token Map"
var listTextarea;        // DOM node for "Token List"
var mapRows;             // map data rows (w/o column headers)
var mapColumns;          // map column names (header fields)
var emailColumnIndex;    // 0 or 1, if email column is 1st or 2nd
var tokenColumnIndex;    // 0 or 1, if token column is 1st or 2nd
var knownTokens;         // "Token List" data
var mapIndex;            // token -> email hash object

// Panel redirection map state.
var cohortIndex = {};    // index of cohort codes and anon links
// ** Shared functions ** //
// The native .test() method coerces non-string values to strings (e.g.
// undefined becomes "undefined") and THEN compares them to the regex. For
// validation that is wrong: a value that is not a string should never match.
//
// testString(value) returns true only when `value` is a string AND matches
// this regex; any non-string value yields false without being coerced.
RegExp.prototype.testString = function (value) {
    return typeof value === 'string' && this.test(value);
};
// Error type raised when user-supplied data fails validation. Callers
// distinguish it from unexpected errors with `instanceof UserInputError`.
//   message: human-readable description of what was wrong with the input.
var UserInputError = function (message) {
    this.message = message;
    // Record where the error was created. captureStackTrace is V8-specific,
    // hence the guard.
    if (typeof Error.captureStackTrace === 'function') {
        Error.captureStackTrace(this, UserInputError);
    }
};
// Inherit from Error without invoking the Error constructor, so no stale
// message/stack from definition time ends up on the prototype.
UserInputError.prototype = Object.create(Error.prototype);
// The constructor reference must point back at this type, not at Error.
UserInputError.prototype.constructor = UserInputError;
UserInputError.prototype.name = 'UserInputError';
// Registers an alert for the user and marks that an alert is active.
//   type: alert category; used as the key into alertMessages and as the
//     suffix of the "alert-" CSS class in the generated markup. The special
//     type 'invalid' is recorded as 'danger' and additionally makes this
//     function throw a UserInputError carrying `message`.
//   message: text placed inside an <li> of the generated markup.
// NOTE(review): this listing appears to be missing lines (closing braces,
// and presumably the statements that append `message` to
// alertMessages[type] and write alertMarkup into the page) — confirm
// against the original file before editing.
// NOTE(review): `message` is concatenated into HTML unescaped, and callers
// build messages from user-entered values (e.g. the salt); consider
// escaping before insertion.
var displayAlert = function (type, message) {
// 'invalid' is just like 'danger' except it also throws an error.
var shouldThrow = false;
if (type === 'invalid') {
type = 'danger';
shouldThrow = true;
// Flag read elsewhere in this file to decide whether to stop processing.
alert = true;
// presumably creates alertMessages[type] as [] when absent — verify
// against util.initProp.
util.initProp(alertMessages, type, []);
// Rebuild the markup for every alert type recorded so far, one
// <div><ul> per type with one <li> per message.
var alertMarkup = '';
forEach(alertMessages, function (type, messageList) {
var typeMarkup = '<div role="alert" class="alert alert-' + type + '"><ul>';
forEach(messageList, function (message) {
typeMarkup += '<li>' + message + '</li>';
typeMarkup += '</ul></div>';
alertMarkup += typeMarkup;
// Thrown last, after the alert state and markup have been updated.
if (shouldThrow) {
throw new UserInputError(message);
// Resets the alert state: no alert is active and no messages are recorded.
var clearAlerts = function () {
    alert = false;
    alertMessages = {};
};
// Disables the element with the given DOM id.
//   id: element id, without the leading '#'.
var disableButton = function (id) {
    $('#' + id).prop('disabled', true);
};
// Re-enables the element with the given DOM id.
//   id: element id, without the leading '#'.
var enableButton = function (id) {
    $('#' + id).prop('disabled', false);
};
// # Notes on stripping and hashing.
// Why is it stripped exactly this way? We want to eliminate as many
// characters as possible (to eliminate possibilities that people may
// type them differently from time to time) while still preserving
// uniqueness. Lowercasing assumes that, e.g., 'JohnDoe' and
// 'johndoe' are always the same person. Dropping everything but
// letters, numbers, and periods has a similar function. The period is
// included b/c it is a commonly allowed and commonly used separator that
// may make the difference between, e.g., 'a.jay' and 'aja.y'.
// Stripping is idempotent.
// ## Notes on how universities build such ids:
// * Dartmouth:
// - Full name with periods between, e.g. John.A.Doe
// - When there is more than one person with the same name, an additional
// number is added, e.g. John.A.Doe-2
// * Cornell:
// - initials followed by one or more numbers.
// * Indiana & Bowling Green
// - Just letters
//
// stripToken(token) returns `token` lowercased with every character other
// than a-z, 0-9 and '.' removed. `token` must be a string.
var stripToken = function (token) {
    return token.toLowerCase().replace(/[^a-z0-9.]/g, '');
};
// Hashes a token with SHA-256 after stripping it and appending the salt.
//   token: raw token string; normalized with stripToken() first.
//   salt: string appended to the stripped token before hashing.
// Returns whatever CryptoJS.SHA256 returns, unmodified.
var hashToken = function (token, salt) {
    return CryptoJS.SHA256(stripToken(token) + salt);
};
// Break the list up into equal-sized chunks, applying f() to each item of
// the list, writing a %-complete value to the progress span after each
// chunk. Then execute a callback with the resulting data.
//   list: array of items to process. It is consumed in place: chunks are
//     removed from it with splice().
//   f: function applied to each item through the forEach helper.
//   progressSpan: object whose .html() is called with an integer percent.
//   callback: invoked only when it is a Function.
// NOTE(review): this listing appears to be missing lines (closing braces,
// presumably the push of the final remainder chunk and the calls that
// start/continue chunkFunction), and the `else if` line below is garbled
// — confirm against the original file before editing.
var chunkedLoop = function (list, f, progressSpan, callback) {
var numChunks = 10,
// Math.round yields 0 for lists shorter than 5 items, in which case the
// equal-sized chunks spliced off below are all empty.
chunkSize = Math.round(list.length / numChunks),
chunkedList = [], // will be a list of lists
// Concatenated results of all chunks, passed to the callback.
resultList = [],
x, // just a loop variable
chunkIndex = 0; // tracks the current chunk across timeouts
// Splice off chunks of equal size, but allow the last one to be of an
// arbitrary size, in case numChunks doesn't divide the length of the
// list evenly.
for (x = 0; x < numChunks - 1; x += 1) {
chunkedList.push(list.splice(0, chunkSize));
// Schedule a series of timeouts, one for each chunk. The browser
// stops blocking for a moment between each chunk, allowing the screen
// to update. This is the only way to have progress values reported to
// the view mid-loop. If it was done in a single loop, execution would
// block all the way to the end, and the screen would only update once
// at 100%.
var chunkFunction = function () {
setTimeout(function () {
// Run f on the chunk.
var chunk = chunkedList[chunkIndex];
// presumably forEach returns the list of f's results — verify against
// the helper's definition.
var r = forEach(chunk, f);
resultList = resultList.concat(r);
chunkIndex += 1;
// Update progress on the screen.
progressSpan.html(Math.round(chunkIndex / numChunks * 100));
// Schedule the next run, if this isn't the last chunk. If it
// is the last chunk, execute the callback with the results.
if (chunkIndex < chunkedList.length) {
} else if (callback instanceof Function) {, resultList);
// There's no reason to delay more than the minimum one
// millisecond, since the point is just to break up javascript's
// single-threaded blocking.
}, 1);
// ** Functions for Deidentifier only. ** //
// The bulk of the flops happen here.
// Accepts and returns the whole row as a string. Hashes data found in the
// field designated by tokenIndex (calculated earlier in deidentify()).
//   rowString: one data row, fields delimited by `separator`.
// Returns the row re-joined with `separator`, with the token field replaced
// by hashToken(field, salt). All other fields are passed through untouched.
var hashRow = function (rowString) {
    var rowArray = rowString.split(separator),
        rawToken = rowArray[tokenIndex],
        hashedToken = hashToken(rawToken, salt);
    // Swap the raw token for its hash, in place.
    rowArray.splice(tokenIndex, 1, hashedToken);
    return rowArray.join(separator);
};
// Entry point for deidentification. Reads the salt from #saltInput and the
// data from #dataTextarea, validates both, then hashes the token column of
// every row via chunkedLoop()/hashRow() and writes the result back into the
// same textarea with the token column header renamed to tokenColumnRename.
// NOTE(review): this listing appears to be missing lines (closing braces,
// the opening `displayAlert(...,` of several calls, the tail of a few
// expressions, and presumably an early return) — confirm against the
// original file before editing.
var deidentify = function () {
// Read user data
// NOTE(review): no declaration of `salt` is visible in this listing; under
// 'use strict' this assignment requires one — confirm it exists.
salt = $('#saltInput').val();
var textarea = $('#dataTextarea'),
rows = textarea.val().split('\n'),
// Store column header row separately.
providedColumns = rows.splice(0, 1)[0].split(separator);
// Position of the token column in the header; -1 when it is absent.
tokenIndex = providedColumns.indexOf(tokenColumnName);
// Validate user data.
if (!saltRegex.testString(salt)) {
// See regex definition above.
displayAlert('danger', "Invalid salt: " + salt);
if (rows.length > maximumRows) {
displayAlert('danger', "Exceeded row limit of 1,000,000.");
} else if (rows.length === 0) {
displayAlert('danger', "No data rows found.");
} else {
// Spot-check only the first data row's token against tokenRegex.
var sampleId = rows[0].split(separator)[tokenIndex];
if (!tokenRegex.testString(sampleId)) {
"Student id didn't match expected pattern: " + sampleId
// Check every row for the right field count and a non-empty token.
// presumably i is the 0-based row index, so i + 2 accounts for the
// header row and 1-based numbering — verify against forEach.
forEach(rows, function (r, i) {
var fields = r.split(separator);
if (fields.length !== providedColumns.length) {
displayAlert('danger', "Malformed data. Row " + (i + 2) +
" has the wrong number of values." +
if (!fields[tokenIndex]) {
displayAlert('danger', "Student token missing on row " +
(i + 2) + ".");
// Detect a header that names the token column more than once.
var tokenFound = false;
forEach(providedColumns, function (colName) {
if (colName === tokenColumnName) {
if (!tokenFound) {
tokenFound = true;
} else {
"Token column included more than once.");
// NOTE(review): `contains` is not a standard Array method; presumably
// provided elsewhere in the project — confirm.
if (!providedColumns.contains(tokenColumnName)) {
"Invalid headers. Must be tab-separated and " +
"include the column '" + tokenColumnName + "'."
// If any alerts have been raised, don't process the data.
if (alert) {
// Iterate over rows in 10 chunks, applying the hashRow() function to
// each row, writing a %-complete value to the progress span after
// each chunk. Then execute a callback with the resulting data.
chunkedLoop(rows, hashRow, $('#progressSpan'), function (hashedRows) {
// Change the name of the student id column.
providedColumns[tokenIndex] = tokenColumnRename;
// Display result.
textarea.val(providedColumns.join(separator) + '\n' +
displayAlert('success', "Deidentification complete.");
// ** Functions for Email Selection only. ** //
// Validates that both inputs for email selection are present and within
// the row limit, and works out the column order of the Email-Token Map.
// Reads mapRows, knownTokens, mapTextarea, listTextarea and mapColumns;
// sets emailColumnIndex and tokenColumnIndex. Problems are reported through
// displayAlert('invalid', ...).
var checkDataSize = function () {
    var requiredColumns = ['email', 'token']; // must be present in header
    // Both values are arrays of rows, so it is their lengths that must be
    // compared with the limit.
    if (mapRows.length > maximumRows || knownTokens.length > maximumRows) {
        displayAlert('invalid', "Exceeded row limit of 1,000,000.");
    }
    if (!mapTextarea.val() || mapRows.length === 0) {
        displayAlert('invalid', "No Email-Token Map found.");
    }
    if (!listTextarea.val()) {
        displayAlert('invalid', "No Token List found.");
    }
    if (util.arrayEqual(mapColumns, requiredColumns)) {
        // Header is exactly email, token.
        emailColumnIndex = 0;
        tokenColumnIndex = 1;
    } else if (util.arrayEqual(mapColumns.slice().sort(), requiredColumns)) {
        // Same two columns in the other order. Sorting a copy leaves the
        // shared mapColumns in the order the user provided.
        tokenColumnIndex = 0;
        emailColumnIndex = 1;
    } else {
        displayAlert('invalid',
            "Values in Email-Token Map must be " +
            "tab-separated with columns " +
            "'token' and 'email'.");
    }
};
// Validates one data row of the Email-Token Map.
//   token: value from the row's token column.
//   email: value from the row's email column.
//   rowIndex: 0-based index of the row among the data rows; reported to the
//     user as rowIndex + 2.
// Problems are reported through displayAlert('invalid', ...).
var checkMapRowFormat = function (token, email, rowIndex) {
    if (!token || !email) {
        displayAlert('invalid',
            "Data is missing in row " + (rowIndex + 2) + ".");
    }
    if (!tokenRegex.testString(token)) {
        displayAlert('invalid',
            "Token in Email-Token Map has less than three " +
            "characters: " + token);
    }
    if (!emailRegex.testString(email)) {
        displayAlert('invalid', "Invalid email address: " + email);
    }
};
// Validates a single token and returns its stripped form.
//   token: raw token; must match tokenRegex.
// A token that fails the check is reported through
// displayAlert('invalid', ...). Returns stripToken(token).
var checkTokenFormat = function (token) {
    if (!tokenRegex.testString(token)) {
        displayAlert('invalid', "Invalid token: " + token +
            ". Each should have 3 or more characters and no spaces or " +
            "other whitespace." + trailingLinesWarning);
    }
    return stripToken(token);
};
// Checks that mapTokens, mapEmails and knownTokens contain no repeated
// values by comparing each list's length with that of its
// util.arrayUnique() result. Duplicates are reported through
// displayAlert('invalid', ...).
var checkEmailAndTokenUniqueness = function () {
    if (mapTokens.length !== util.arrayUnique(mapTokens).length) {
        displayAlert('invalid', "Duplicate tokens in Email-Token Map.");
    }
    if (mapEmails.length !== util.arrayUnique(mapEmails).length) {
        displayAlert('invalid', "Duplicate emails in Email-Token Map.");
    }
    if (knownTokens.length !== util.arrayUnique(knownTokens).length) {
        displayAlert('invalid', "Duplicate tokens in Token List.");
    }
};
// Builds mapIndex (token -> email) from mapRows, checking each row as it
// goes. Once an alert is active the remaining rows are skipped.
// NOTE(review): this listing appears to be missing lines (closing braces,
// the tail of the "Malformed data" message, and presumably the statements
// that add each token/email to mapTokens/mapEmails, which are reset here
// but never filled in the visible code) — confirm against the original
// file before editing.
// NOTE(review): no declaration of mapTokens or mapEmails is visible in this
// listing; under 'use strict' these assignments require one — confirm.
var indexEmailTokenMap = function () {
// Reset data that might be left over from previous runs.
mapIndex = {};
mapTokens = [];
mapEmails = [];
forEach(mapRows, function (row, rowIndex) {
// Stop doing work after the first problem has been reported.
if (alert) { return; }
var fields = row.split(separator),
token = fields[tokenColumnIndex],
email = fields[emailColumnIndex];
// A map row must have exactly two fields.
if (fields.length !== 2) {
displayAlert('danger', "Malformed data. Row " + (rowIndex + 2) +
" has the wrong number of values." +
// checkTokenFormat returns the stripped token, so the index is keyed by
// the stripped form.
token = checkTokenFormat(token);
checkMapRowFormat(token, email, rowIndex);
mapIndex[token] = email;
// Entry point for email selection. Reads the Email-Token Map from
// #mapTextarea and the Token List from #listTextarea, validates them, then
// writes to #resultPre the emails whose token is not in the Token List.
// NOTE(review): this listing appears to be missing lines (closing braces,
// the check/index calls inside the try block that the comments describe,
// the early return in the catch, and the opening of one displayAlert call)
// — confirm against the original file before editing.
var selectEmails = function () {
// Read user data
mapTextarea = $('#mapTextarea');
listTextarea = $('#listTextarea');
mapRows = mapTextarea.val().split('\n');
// The first line of the map is its header; it is removed from mapRows.
mapColumns = mapRows.splice(0, 1)[0].split(separator);
knownTokens = listTextarea.val().split('\n');
// Each of these check functions may call displayAlert of type
// 'invalid', which also throws an Error. Catching the error and
// returning is a convenient way to stop the function and let the user
// try again.
try {
// Check that the data is present and can be processed.
// Check the tokens in the Token List look good, and strip them.
knownTokens = forEach(knownTokens, checkTokenFormat);
// Process everything into convenient data structures.
// Do a uniqueness check on everything. Better to alert the user
// that they've pasted bad data than silently give a problematic
// result.
} catch (e) {
// Don't continue processing. Let the user read alerts and
// re-submit.
if (e instanceof UserInputError) {
// Else rethrow to expose errors we don't expect.
throw e;
// Actually do the requested processing.
// presumably forEach passes (key, value) for objects and omits
// undefined results from the returned list — verify against the helper.
var emailsNotInList = forEach(mapIndex, function (token, email) {
if (!knownTokens.contains(token)) { return email; }
var tokensNotInMap = forEach(knownTokens, function (token) {
if (!mapIndex[token]) { return token; }
// Display result.
$('#resultPre').html(emailsNotInList.join("\n") || "No emails found.");
// If no listed token matched the map at all, the inputs are probably
// mismatched, so warn instead of reporting success.
if (tokensNotInMap.length === knownTokens.length) {
"None of the listed tokens were found in the Email-" +
"Token Map. Please check your data sources.");
} else {
displayAlert('success', "Processing complete.");
// ** Functions for Panel Redirection Map ** //
// Validates a parsed panel before upload.
//   rows: array of data row strings (header already removed).
//   columns: array of normalized header names.
// Checks that there is data, that every name in panelRequiredColumns is
// present in `columns`, and that every row has as many fields as the
// header. Problems are reported through displayAlert('invalid', ...).
var checkPanelFormat = function (rows, columns) {
    // Checking .length < 2 here b/c ''.split('\t') is [''], not [].
    if (rows.length === 0 || columns.length < 2) {
        displayAlert('invalid', "No data found.");
    }
    forEach(panelRequiredColumns, function (col) {
        if (!columns.contains(col)) {
            displayAlert('invalid', "Invalid columns in csv. Must " +
                "include '" + panelRequiredColumns.join("', '") + "'.");
        }
    });
    forEach(rows, function (r, i) {
        if (r.split(separator).length !== columns.length) {
            // i + 2: skip the header row and report 1-based row numbers.
            displayAlert('invalid', "Malformed data. Row " + (i + 2) +
                " has the wrong number of values." +
                trailingLinesWarning);
        }
    });
};
// Makes #updateLinkInput editable by clearing its 'readonly' property.
// NOTE(review): the statement below has no terminating semicolon, so the
// original presumably continued with chained calls that are missing from
// this listing, along with the closing brace — confirm.
var openEditLinkMode = function () {
$('#updateLinkInput').prop('readonly', null)
// Makes #updateLinkInput read-only again.
// NOTE(review): cohortCode is read but not used in the visible lines, and
// the statement below has no terminating semicolon; the original presumably
// continued with chained calls that are missing from this listing, along
// with the closing brace — confirm.
var closeEditLinkMode = function () {
var cohortCode = $('#cohortCodeSelect').val();
$('#updateLinkInput').prop('readonly', true)
// Creates a cohort by POSTing the cohort code from #cohortCodeInput and the
// link from #anonymousLinkInput to /panel_redirection_map, then reports the
// outcome with displayAlert.
// NOTE(review): this listing appears to be missing lines (closing braces
// and the body of the timeout callback, presumably a page refresh given
// the "Refreshing..." message) — confirm against the original file.
var createCohort = function () {
var cohortCode = $('#cohortCodeInput').val(),
anonymousLink = $('#anonymousLinkInput').val();
$.ajax('/panel_redirection_map', {
method: 'POST',
data: {
cohort_code: cohortCode,
anonymous_link: anonymousLink
success: function (data, status, xhr) {
// The response body is compared as text; the literal 'True' means success.
if (data === 'True') {
displayAlert('success', "Created cohort. Refreshing...");
// Runs after a two second delay.
setTimeout(function () {
}, 2 * 1000);
// NOTE(review): `contains` is not a standard String method; presumably
// provided elsewhere in the project — confirm.
} else if (data.contains('Duplicate entry')) {
displayAlert('danger', "Cohort '" + cohortCode +
"' already exists.");
} else {
displayAlert('danger', "Error during upload: " + data);
// NOTE(review): this throws from an asynchronous callback, so no caller
// can catch it and the user sees no alert — consider displayAlert here.
error: function (xhr, status, error) {
throw new Error(error);
// Updates a cohort's link by sending a PUT to /panel_redirection_map with
// the cohort code from #cohortCodeSelect and the link from
// #updateLinkInput, then reports the outcome with displayAlert.
// NOTE(review): this listing appears to be missing lines (closing braces
// and the body of the timeout callback, presumably a page refresh given
// the "Refreshing..." message) — confirm against the original file.
var updateLink = function () {
var cohortCode = $('#cohortCodeSelect').val(),
anonymousLink = $('#updateLinkInput').val();
$.ajax('/panel_redirection_map', {
method: 'PUT',
data: {
cohort_code: cohortCode,
anonymous_link: anonymousLink
success: function (data, status, xhr) {
// The response body is compared as text; the literal 'True' means success.
if (data === 'True') {
displayAlert('success', "Updated link. Refreshing...");
// Runs after a two second delay.
setTimeout(function () {
}, 2 * 1000);
} else {
displayAlert('danger', "Error during upload: " + data);
// NOTE(review): this throws from an asynchronous callback, so no caller
// can catch it and the user sees no alert — consider displayAlert here.
error: function (xhr, status, error) {
throw new Error(error);
// Uploads a panel for the cohort selected in #cohortCodeSelect. Reads the
// pasted panel from #panelTextarea, normalizes and validates its header,
// reduces each row to the columns in panelRequiredColumns, and POSTs the
// result to /panel_redirection_map.
// NOTE(review): this listing appears to be missing lines (closing braces,
// presumably the statement that appends each reduced row to `p`, and the
// tail of the simplePanel expression) — confirm against the original file
// before editing.
var uploadPanel = function () {
var panelCsv = $('#panelTextarea').val(),
panelRows = panelCsv.split("\n"),
// The first line is the header; it is removed from panelRows.
panelColumns = panelRows.splice(0, 1)[0].split(separator),
cohortCode = $('#cohortCodeSelect').val();
// Qualtrics uses annoying capitalization and whitespace.
// Headers are lowercased and their spaces replaced with underscores so
// they can be matched against panelRequiredColumns.
panelColumns = forEach(panelColumns, function (col) {
return col.toLowerCase().replace(/ /g, '_');
// displayAlert('invalid', ...) throws a UserInputError; catching it here
// stops the upload and leaves the alert for the user to read.
try {
if (!cohortCode) {
displayAlert('invalid', "All fields required.");
checkPanelFormat(panelRows, panelColumns);
} catch (e) {
if (e instanceof UserInputError) { return; }
throw e;
// Simplify the data to just the columns we care about.
var p = []
// Position of each required column within the provided header.
var colIndices = forEach(panelRequiredColumns, function (col) {
return panelColumns.indexOf(col);
forEach(panelRows, function (rowString) {
// NOTE(review): splits on a literal '\t' where the rest of the file uses
// `separator`; same value today, but they can drift apart.
var row = rowString.split('\t');
row = forEach(colIndices, function (i) { return row[i]; });
var simplePanel = panelRequiredColumns.join('\t') + '\n' +
$.ajax('/panel_redirection_map', {
method: 'POST',
data: {
cohort_code: cohortCode,
panel: simplePanel
success: function (data, status, xhr) {
// The response body is compared as text; the literal 'True' means success.
if (data === 'True') {
displayAlert('success', "Upload successful.");
} else {
displayAlert('danger', "Error during upload. Please " +
"check data and try again.");
// NOTE(review): this throws from an asynchronous callback, so no caller
// can catch it and the user sees no alert — consider displayAlert here.
error: function (xhr, status, error) {
throw new Error(error);
// ** PageLoad Execution ** //
// Passes a setup function to $() — presumably jQuery's DOM-ready shorthand;
// confirm which library provides $.
// NOTE(review): this listing appears to be missing lines (closing braces
// and the bodies of the change/click handlers) — confirm against the
// original file before editing.
$(function () {
// The wiring below only applies to the panel redirection map page.
if (window.location.pathname === '/panel_redirection_map') {
// Index the cohort data for easy lookups. They're dumped to the page as
// a JSON list of dictionaries.
forEach(window.cohorts, function (row) {
cohortIndex[row.cohort_code] = row.anonymous_link;
// When the selected cohort changes, look up that cohort's link.
$('#cohortCodeSelect').change(function () {
var newLink = cohortIndex[$(this).val()];
// Branches on whether the link input is currently read-only; presumably
// toggles between opening and closing edit mode — branch bodies are not
// visible here.
$('#editLinkButton').click(function () {
if ($('#updateLinkInput').prop('readonly')) {
} else {
// Public interface of the module: the object returned here becomes the
// value of DEIDENTIFIER. Only these five entry points are exposed; every
// other function and variable above stays private to the closure.
// NOTE(review): the closing of this object and of the enclosing function
// is not visible in this listing — confirm against the original file.
return {
deidentify: deidentify,
selectEmails: selectEmails,
createCohort: createCohort,
updateLink: updateLink,
uploadPanel: uploadPanel
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment