kathawala/js_keccak.js

## js_keccak.js
// TODO: Finish this up, test speed vs GPU vs emscripten transpiled C->asm.js
// Currently looking at the keccak function, which is given as input:
//   input - in our case the "caveat emptor" string
//   inlen - length of input = 13
//      md - the buffer of 200 bytes we want to fill up
//   stlen - length of uint64_t array with 25 elements = 200 bytes
//           ( aka: output length we desire )

// Reading through the code there is:
//   a temp buffer of 144 bytes
//   a   st buffer of 200 bytes
//   rsiz  = 136
//   rsizw = 136 / 8 = 17

// rsiz and rsizw are "rate" and "rateInBytes" in the Keccak reference, but "rate"
// in Keccak reference is 1088, thus "rateInBytes" should be 136...?

// There is a loop which is sometimes skipped, but for now we will ignore it,
// it seems to move along the input and do some computations until it is only
// looking at input where input is lt rsiz (rsiz == 136)

// This loop seems only to be triggered when the input is gte rsiz (rsiz == 136)

// Afterwards we copy the input (or what's left of it) to the temp buffer
// Then we add a 1 to the end of what's been copied and pad the temp buffer up
// to rsiz size (i.e. until it is 136 bytes) with 0s
// Then we bitwise OR the last byte of rsiz (i.e. temp[rsiz-1]) with 0x80

// Then in a for loop for rsizw (rsizw == 17) rounds we do this
// in every iteration
//   for every 8 bytes of st,
//     we set those 8 bytes to bitwise XOR of those 8 bytes with 8 bytes from temp


// Now we call keccakf on st with the number of Keccak Rounds set at 24

// Then we memcpy st into md and return (voila! keccak hash!)

// the keccakf function is given the following inputs:
//   st - a buffer of 8 byte values with 25 values in it
//   rounds - number of rounds (in our case 24)

// we then initialize the following vars
//   t - an 8 byte value
//   bc - a 5 value array of 8 byte values

// Thus, for each round there are 4 steps. I will leave the reading of this code
// up to you, since it is pretty descriptive in and of itself. Only here I will
// note possible speed up points

// In the Theta step, the second loop has you calculating 5 t values and successively
// XORing them with different values in st. It would probably be faster to calculate
// the 5 t values all at once (in an unrolled sort of way) and then successively
// XOR all the values in st all at once. (play with some XORing to make sure)

// In the Rho Pi step, I think having an extra variable is more readable than
// bc[0] = st[j]
// st[j] = blah(t, ajksf)
// t = bc[0]

// In the Chi step you can definitely shave off some loops, just need to experiment
// with how in a scratch C program.

// In the Iota step you are lastly taking a constant and XORing it with st[0]

// Congrats! Now you can implement a keccak hash in JavaScript (although it seems
// to differ a bit from the reference keccak hash)

// New place to see a good Keccak reference (all optimizations are there):
//   https://github.com/lucasjones/cpuminer-multi/tree/master/crypto
//   (newer miner with ops) https://github.com/fireice-uk/xmr-stak-cpu/tree/master/crypto

// function keccak(input, inlen, md, mdlen) { //md, mdlen here not necessary. mdlen==200

function print64bit(hi, lo) {
  hi = hi.toString(2);
  lo = lo.toString(2);
  if (hi.length < 32) {
    var padding = 32 - hi.length;
    for (var i = 0; i < padding; i++) {
      hi = "0" + hi;
    }
  }
  if (lo.length < 32) {
    var padding = 32 - lo.length;
    for (var i = 0; i < padding; i++) {
      lo = "0" + lo;
    }
  }
  console.log(hi + " " + lo);
}

module.exports = {
  keccak: keccak
}

function keccak(input, inlen) {
  const KECCAK_ROUNDS = 24;
  const HASH_DATA_AREA = 136;
  var state_buffer = new ArrayBuffer(200);
  var st = new Uint32Array(state_buffer);

  // original code runs on 64-bit data, since JS can only handle 32
  // we divide by 4 (bytes) instead of 8 (bytes) as in original code
  var rsizw = HASH_DATA_AREA/4;

  // for loop which gets skipped on small inputs!!!
  var end = 0;
  console.log(inlen);
  var special_epochs = Math.floor(inlen / HASH_DATA_AREA);
  if (special_epochs > 0) {
    inlen = inlen % HASH_DATA_AREA;
    for (var j = 0; j < special_epochs; ++j) {
      var start = j*HASH_DATA_AREA;
      end = start+HASH_DATA_AREA;
      var temp = new Uint8Array(144);
      temp.set(input.subarray(start, start+HASH_DATA_AREA));
      var ttemp = new Uint32Array(temp.buffer);
      for (i=0; i<rsizw; ++i) {
	st[i] ^= ttemp[i];
      }
      keccakf(st, KECCAK_ROUNDS);
    }
  }

  var temp = new Uint8Array(144);
  temp.set(input.subarray(end,));
  temp[inlen++] = 1;
  temp[HASH_DATA_AREA-1] |= 0x80;
  var ttemp = new Uint32Array(temp.buffer);

  // var ttemp = new Uint32Array(temp.buffer);

  for (i=0; i<rsizw; ++i) {
    if (i%2 == 1) {
      print64bit(ttemp[i], ttemp[i-1]);
    }
    st[i] ^= ttemp[i];
  }

  keccakf(st, KECCAK_ROUNDS);

  return state_buffer;
}

function keccakf(st, rounds) {
  // const keccakf_piln = [10, 7,  11, 17, 18, 3, 5,  16, 8, 21, 24, 4,
			// 15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1 ];
  // const keccakf_rotc = [1,  3,  6,  10, 15, 21, 28, 36, 45, 55, 2, 14,
			// 27, 41, 56, 8,  25, 43, 62, 18, 39, 61, 20, 44];
  const keccakf_rndc  = [0x00000000, 0x00000001, 0x00000000, 0x00008082, 0x80000000, 0x0000808a,
			 0x80000000, 0x80008000, 0x00000000, 0x0000808b, 0x00000000, 0x80000001,
			 0x80000000, 0x80008081, 0x80000000, 0x00008009, 0x00000000, 0x0000008a,
			 0x00000000, 0x00000088, 0x00000000, 0x80008009, 0x00000000, 0x8000000a,
			 0x00000000, 0x8000808b, 0x80000000, 0x0000008b, 0x80000000, 0x00008089,
			 0x80000000, 0x00008003, 0x80000000, 0x00008002, 0x80000000, 0x00000080,
			 0x00000000, 0x0000800a, 0x80000000, 0x8000000a, 0x80000000, 0x80008081,
			 0x80000000, 0x00008080, 0x00000000, 0x80000001, 0x80000000, 0x80008008];

  // remember that st is a 32 bit array, NOT 64 like in the orig code
  var t = new Uint32Array(2);
  var bc = new Uint32Array(10);

  for (var round=0; round<rounds; ++round) {

    // Theta
    bc[0] = st[0] ^ st[10] ^ st[20] ^ st[30] ^ st[40];
    bc[1] = st[1] ^ st[11] ^ st[21] ^ st[31] ^ st[41];
    bc[2] = st[2] ^ st[12] ^ st[22] ^ st[32] ^ st[42];
    bc[3] = st[3] ^ st[13] ^ st[23] ^ st[33] ^ st[43];
    bc[4] = st[4] ^ st[14] ^ st[24] ^ st[34] ^ st[44];
    bc[5] = st[5] ^ st[15] ^ st[25] ^ st[35] ^ st[45];
    bc[6] = st[6] ^ st[16] ^ st[26] ^ st[36] ^ st[46];
    bc[7] = st[7] ^ st[17] ^ st[27] ^ st[37] ^ st[47];
    bc[8] = st[8] ^ st[18] ^ st[28] ^ st[38] ^ st[48];
    bc[9] = st[9] ^ st[19] ^ st[29] ^ st[39] ^ st[49];

    for (var i=0; i<10; i+=2) { // must unroll loop as modulo 10 is expensive on GPU
      var hi = bc[(i+3) % 10];
      var lo = bc[(i+2) % 10];
      t[0] = bc[(i+8) % 10] ^ rotl64_lo(hi, lo, 1);
      t[1] = bc[(i+9) % 10] ^ rotl64_hi(hi, lo, 1);

      st[i]    ^= t[0];
      st[i+1]  ^= t[1];
      st[i+10] ^= t[0];
      st[i+11] ^= t[1];
      st[i+20] ^= t[0];
      st[i+21] ^= t[1];
      st[i+30] ^= t[0];
      st[i+31] ^= t[1];
      st[i+40] ^= t[0];
      st[i+41] ^= t[1];
    }

    // Rho Pi
    t[0] = st[2];
    t[1] = st[3];

    st[ 2] = rotl64_lo(st[13], st[12], 44);
    st[ 3] = rotl64_hi(st[13], st[12], 44);
    st[12] = rotl64_lo(st[19], st[18], 20);
    st[13] = rotl64_hi(st[19], st[18], 20);
    st[18] = rotl64_lo(st[45], st[44], 61);
    st[19] = rotl64_hi(st[45], st[44], 61);
    st[44] = rotl64_lo(st[29], st[28], 39);
    st[45] = rotl64_hi(st[29], st[28], 39);
    st[28] = rotl64_lo(st[41], st[40], 18);
    st[29] = rotl64_hi(st[41], st[40], 18);
    st[40] = rotl64_lo(st[ 5], st[ 4], 62);
    st[41] = rotl64_hi(st[ 5], st[ 4], 62);
    st[ 4] = rotl64_lo(st[25], st[24], 43);
    st[ 5] = rotl64_hi(st[25], st[24], 43);
    st[24] = rotl64_lo(st[27], st[26], 25);
    st[25] = rotl64_hi(st[27], st[26], 25);
    st[26] = rotl64_lo(st[39], st[38],  8);
    st[27] = rotl64_hi(st[39], st[38],  8);
    st[38] = rotl64_lo(st[47], st[46], 56);
    st[39] = rotl64_hi(st[47], st[46], 56);
    st[46] = rotl64_lo(st[31], st[30], 41);
    st[47] = rotl64_hi(st[31], st[30], 41);
    st[30] = rotl64_lo(st[ 9], st[ 8], 27);
    st[31] = rotl64_hi(st[ 9], st[ 8], 27);
    st[ 8] = rotl64_lo(st[49], st[48], 14);
    st[ 9] = rotl64_hi(st[49], st[48], 14);
    st[48] = rotl64_lo(st[43], st[42],  2);
    st[49] = rotl64_hi(st[43], st[42],  2);
    st[42] = rotl64_lo(st[17], st[16], 55);
    st[43] = rotl64_hi(st[17], st[16], 55);
    st[16] = rotl64_lo(st[33], st[32], 45);
    st[17] = rotl64_hi(st[33], st[32], 45);
    st[32] = rotl64_lo(st[11], st[10], 36);
    st[33] = rotl64_hi(st[11], st[10], 36);
    st[10] = rotl64_lo(st[ 7], st[ 6], 28);
    st[11] = rotl64_hi(st[ 7], st[ 6], 28);
    st[ 6] = rotl64_lo(st[37], st[36], 21);
    st[ 7] = rotl64_hi(st[37], st[36], 21);
    st[36] = rotl64_lo(st[35], st[34], 15);
    st[37] = rotl64_hi(st[35], st[34], 15);
    st[34] = rotl64_lo(st[23], st[22], 10);
    st[35] = rotl64_hi(st[23], st[22], 10);
    st[22] = rotl64_lo(st[15], st[14],  6);
    st[23] = rotl64_hi(st[15], st[14],  6);
    st[14] = rotl64_lo(st[21], st[20],  3);
    st[15] = rotl64_hi(st[21], st[20],  3);
    st[20] = rotl64_lo(t[1], t[0], 1);
    st[21] = rotl64_hi(t[1], t[0], 1);

    // Chi

    // Unrolling of the following loop (last iteration sets bc[0] thru bc[9])

    // for (i=0; i<40; i+=10) {
    //   bc[0] = st[i];
    //   bc[1] = st[i+1];
    //   bc[2] = st[i+2];
    //   bc[3] = st[i+3];

    //   st[i  ] ^= (~st[i+2]) & st[i+4];
    //   st[i+1] ^= (~st[i+3]) & st[i+5];
    //   st[i+2] ^= (~st[i+4]) & st[i+6];
    //   st[i+3] ^= (~st[i+5]) & st[i+7];
    //   st[i+4] ^= (~st[i+6]) & st[i+8];
    //   st[i+5] ^= (~st[i+7]) & st[i+9];
    //   st[i+6] ^= (~st[i+8]) & bc[0];
    //   st[i+7] ^= (~st[i+9]) & bc[1];
    //   st[i+8] ^= (~bc[0]) & bc[2];
    //   st[i+9] ^= (~bc[1]) & bc[3];
    // }

    i = 0;
    bc[0] = st[i];
    bc[1] = st[i+1];
    bc[2] = st[i+2];
    bc[3] = st[i+3];

    st[i  ] ^= (~st[i+2]) & st[i+4];
    st[i+1] ^= (~st[i+3]) & st[i+5];
    st[i+2] ^= (~st[i+4]) & st[i+6];
    st[i+3] ^= (~st[i+5]) & st[i+7];
    st[i+4] ^= (~st[i+6]) & st[i+8];
    st[i+5] ^= (~st[i+7]) & st[i+9];
    st[i+6] ^= (~st[i+8]) & bc[0];
    st[i+7] ^= (~st[i+9]) & bc[1];
    st[i+8] ^= (~bc[0]) & bc[2];
    st[i+9] ^= (~bc[1]) & bc[3];

    i = 10;
    bc[0] = st[i];
    bc[1] = st[i+1];
    bc[2] = st[i+2];
    bc[3] = st[i+3];

    st[i  ] ^= (~st[i+2]) & st[i+4];
    st[i+1] ^= (~st[i+3]) & st[i+5];
    st[i+2] ^= (~st[i+4]) & st[i+6];
    st[i+3] ^= (~st[i+5]) & st[i+7];
    st[i+4] ^= (~st[i+6]) & st[i+8];
    st[i+5] ^= (~st[i+7]) & st[i+9];
    st[i+6] ^= (~st[i+8]) & bc[0];
    st[i+7] ^= (~st[i+9]) & bc[1];
    st[i+8] ^= (~bc[0]) & bc[2];
    st[i+9] ^= (~bc[1]) & bc[3];

    i = 20;
    bc[0] = st[i];
    bc[1] = st[i+1];
    bc[2] = st[i+2];
    bc[3] = st[i+3];

    st[i  ] ^= (~st[i+2]) & st[i+4];
    st[i+1] ^= (~st[i+3]) & st[i+5];
    st[i+2] ^= (~st[i+4]) & st[i+6];
    st[i+3] ^= (~st[i+5]) & st[i+7];
    st[i+4] ^= (~st[i+6]) & st[i+8];
    st[i+5] ^= (~st[i+7]) & st[i+9];
    st[i+6] ^= (~st[i+8]) & bc[0];
    st[i+7] ^= (~st[i+9]) & bc[1];
    st[i+8] ^= (~bc[0]) & bc[2];
    st[i+9] ^= (~bc[1]) & bc[3];

    i = 30;
    bc[0] = st[i];
    bc[1] = st[i+1];
    bc[2] = st[i+2];
    bc[3] = st[i+3];

    st[i  ] ^= (~st[i+2]) & st[i+4];
    st[i+1] ^= (~st[i+3]) & st[i+5];
    st[i+2] ^= (~st[i+4]) & st[i+6];
    st[i+3] ^= (~st[i+5]) & st[i+7];
    st[i+4] ^= (~st[i+6]) & st[i+8];
    st[i+5] ^= (~st[i+7]) & st[i+9];
    st[i+6] ^= (~st[i+8]) & bc[0];
    st[i+7] ^= (~st[i+9]) & bc[1];
    st[i+8] ^= (~bc[0]) & bc[2];
    st[i+9] ^= (~bc[1]) & bc[3];

    i = 40;
    bc[0] = st[i];
    bc[1] = st[i+1];
    bc[2] = st[i+2];
    bc[3] = st[i+3];
    bc[4] = st[i+4];
    bc[5] = st[i+5];
    bc[6] = st[i+6];
    bc[7] = st[i+7];
    bc[8] = st[i+8];
    bc[9] = st[i+9];

    st[i  ] ^= (~st[i+2]) & st[i+4];
    st[i+1] ^= (~st[i+3]) & st[i+5];
    st[i+2] ^= (~st[i+4]) & st[i+6];
    st[i+3] ^= (~st[i+5]) & st[i+7];
    st[i+4] ^= (~st[i+6]) & st[i+8];
    st[i+5] ^= (~st[i+7]) & st[i+9];
    st[i+6] ^= (~st[i+8]) & bc[0];
    st[i+7] ^= (~st[i+9]) & bc[1];
    st[i+8] ^= (~bc[0]) & bc[2];
    st[i+9] ^= (~bc[1]) & bc[3];

    // Iota
    var round_index = round*2;
    st[0] ^= keccakf_rndc[round_index+1];
    st[1] ^= keccakf_rndc[round_index];
  }
  return st;
}

// function to rotate two 32 bit ints (as if they are one 64 bit int)
// n is never 0, so we don't check for that case
// Because of Javascript treating all bitwise operations as done on
// *signed* 32-bit integers, we need to sprinkle in many ">>> 0" ops
// throughout the code and we also have to always shift by ">>>" not ">>"
// see here: https://stackoverflow.com/questions/6798111/bitwise-operations-on-32-bit-unsigned-ints

function rotl64_hi(hi, lo, offset) {

  if (offset & 0x20) { // if offset >= 32 we switch lo and hi
    lo ^= hi;
    hi ^= lo;
    lo ^= hi;
  }

  return (((hi << offset) >>> 0) | (lo >>> (32 - offset))) >>> 0;
}

function rotl64_lo(hi, lo, offset) {

  if (offset & 0x20) { // if offset >= 32 we switch lo and hi
    lo ^= hi;
    hi ^= lo;
    lo ^= hi;
  }

  return (((lo << offset) >>> 0) | (hi >>> (32 - offset))) >>> 0;
}

// i think some of the convenience functions from
// https://gist.github.com/devi/27a8bfcf77eb765540d2
// would be a good place to look to make this easier to implement
	// TODO: Finish this up, test speed vs GPU vs emscripten transpiled C->asm.js
	// Currently looking at the keccak function, which is given as input:
	// input - in our case the "caveat emptor" string
	// inlen - length of input = 13
	// md - the buffer of 200 bytes we want to fill up
	// stlen - length of uint64_t array with 25 elements = 200 bytes
	// ( aka: output length we desire )

	// Reading through the code there is:
	// a temp buffer of 144 bytes
	// a st buffer of 200 bytes
	// rsiz = 136
	// rsizw = 136 / 8 = 17

	// rsiz and rsizw are "rate" and "rateInBytes" in the Keccak reference, but "rate"
	// in Keccak reference is 1088, thus "rateInBytes" should be 136...?

	// There is a loop which is sometimes skipped, but for now we will ignore it,
	// it seems to move along the input and do some computations until it is only
	// looking at input where input is lt rsiz (rsiz == 136)

	// This loop seems only to be triggered when the input is gte rsiz (rsiz == 136)

	// Afterwards we copy the input (or what's left of it) to the temp buffer
	// Then we add a 1 to the end of what's been copied and pad the temp buffer up
	// to rsiz size (i.e. until it is 136 bytes) with 0s
	// Then we bitwise OR the last byte of rsiz (i.e. temp[rsiz-1]) with 0x80

	// Then in a for loop for rsizw (rsizw == 17) rounds we do this
	// in every iteration
	// for every 8 bytes of st,
	// we set those 8 bytes to bitwise XOR of those 8 bytes with 8 bytes from temp


	// Now we call keccakf on st with the number of Keccak Rounds set at 24

	// Then we memcpy st into md and return (voila! keccak hash!)

	// the keccakf function is given the following inputs:
	// st - a buffer of 8 byte values with 25 values in it
	// rounds - number of rounds (in our case 24)

	// we then initialize the following vars
	// t - an 8 byte value
	// bc - a 5 value array of 8 byte values

	// Thus, for each round there are 4 steps. I will leave the reading of this code
	// up to you, since it is pretty descriptive in and of itself. Only here I will
	// note possible speed up points

	// In the Theta step, the second loop has you calculating 5 t values and successively
	// XORing them with different values in st. It would probably be faster to calculate
	// the 5 t values all at once (in an unrolled sort of way) and then successively
	// XOR all the values in st all at once. (play with some XORing to make sure)

	// In the Rho Pi step, I think having an extra variable is more readable than
	// bc[0] = st[j]
	// st[j] = blah(t, ajksf)
	// t = bc[0]

	// In the Chi step you can definitely shave off some loops, just need to experiment
	// with how in a scratch C program.

	// In the Iota step you are lastly taking a constant and XORing it with st[0]

	// Congrats! Now you can implement a keccak hash in JavaScript (although it seems
	// to differ a bit from the reference keccak hash)

	// New place to see a good Keccak reference (all optimizations are there):
	// https://github.com/lucasjones/cpuminer-multi/tree/master/crypto
	// (newer miner with ops) https://github.com/fireice-uk/xmr-stak-cpu/tree/master/crypto

	// function keccak(input, inlen, md, mdlen) { //md, mdlen here not necessary. mdlen==200

	function print64bit(hi, lo) {
	hi = hi.toString(2);
	lo = lo.toString(2);
	if (hi.length < 32) {
	var padding = 32 - hi.length;
	for (var i = 0; i < padding; i++) {
	hi = "0" + hi;
	}
	}
	if (lo.length < 32) {
	var padding = 32 - lo.length;
	for (var i = 0; i < padding; i++) {
	lo = "0" + lo;
	}
	}
	console.log(hi + " " + lo);
	}

	module.exports = {
	keccak: keccak
	}

	function keccak(input, inlen) {
	const KECCAK_ROUNDS = 24;
	const HASH_DATA_AREA = 136;
	var state_buffer = new ArrayBuffer(200);
	var st = new Uint32Array(state_buffer);

	// original code runs on 64-bit data, since JS can only handle 32
	// we divide by 4 (bytes) instead of 8 (bytes) as in original code
	var rsizw = HASH_DATA_AREA/4;

	// for loop which gets skipped on small inputs!!!
	var end = 0;
	console.log(inlen);
	var special_epochs = Math.floor(inlen / HASH_DATA_AREA);
	if (special_epochs > 0) {
	inlen = inlen % HASH_DATA_AREA;
	for (var j = 0; j < special_epochs; ++j) {
	var start = j*HASH_DATA_AREA;
	end = start+HASH_DATA_AREA;
	var temp = new Uint8Array(144);
	temp.set(input.subarray(start, start+HASH_DATA_AREA));
	var ttemp = new Uint32Array(temp.buffer);
	for (i=0; i<rsizw; ++i) {
	st[i] ^= ttemp[i];
	}
	keccakf(st, KECCAK_ROUNDS);
	}
	}

	var temp = new Uint8Array(144);
	temp.set(input.subarray(end,));
	temp[inlen++] = 1;
	temp[HASH_DATA_AREA-1] \|= 0x80;
	var ttemp = new Uint32Array(temp.buffer);

	// var ttemp = new Uint32Array(temp.buffer);

	for (i=0; i<rsizw; ++i) {
	if (i%2 == 1) {
	print64bit(ttemp[i], ttemp[i-1]);
	}
	st[i] ^= ttemp[i];
	}

	keccakf(st, KECCAK_ROUNDS);

	return state_buffer;
	}

	function keccakf(st, rounds) {
	// const keccakf_piln = [10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
	// 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 ];
	// const keccakf_rotc = [1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
	// 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44];
	const keccakf_rndc = [0x00000000, 0x00000001, 0x00000000, 0x00008082, 0x80000000, 0x0000808a,
	0x80000000, 0x80008000, 0x00000000, 0x0000808b, 0x00000000, 0x80000001,
	0x80000000, 0x80008081, 0x80000000, 0x00008009, 0x00000000, 0x0000008a,
	0x00000000, 0x00000088, 0x00000000, 0x80008009, 0x00000000, 0x8000000a,
	0x00000000, 0x8000808b, 0x80000000, 0x0000008b, 0x80000000, 0x00008089,
	0x80000000, 0x00008003, 0x80000000, 0x00008002, 0x80000000, 0x00000080,
	0x00000000, 0x0000800a, 0x80000000, 0x8000000a, 0x80000000, 0x80008081,
	0x80000000, 0x00008080, 0x00000000, 0x80000001, 0x80000000, 0x80008008];

	// remember that st is a 32 bit array, NOT 64 like in the orig code
	var t = new Uint32Array(2);
	var bc = new Uint32Array(10);

	for (var round=0; round<rounds; ++round) {

	// Theta
	bc[0] = st[0] ^ st[10] ^ st[20] ^ st[30] ^ st[40];
	bc[1] = st[1] ^ st[11] ^ st[21] ^ st[31] ^ st[41];
	bc[2] = st[2] ^ st[12] ^ st[22] ^ st[32] ^ st[42];
	bc[3] = st[3] ^ st[13] ^ st[23] ^ st[33] ^ st[43];
	bc[4] = st[4] ^ st[14] ^ st[24] ^ st[34] ^ st[44];
	bc[5] = st[5] ^ st[15] ^ st[25] ^ st[35] ^ st[45];
	bc[6] = st[6] ^ st[16] ^ st[26] ^ st[36] ^ st[46];
	bc[7] = st[7] ^ st[17] ^ st[27] ^ st[37] ^ st[47];
	bc[8] = st[8] ^ st[18] ^ st[28] ^ st[38] ^ st[48];
	bc[9] = st[9] ^ st[19] ^ st[29] ^ st[39] ^ st[49];

	for (var i=0; i<10; i+=2) { // must unroll loop as modulo 10 is expensive on GPU
	var hi = bc[(i+3) % 10];
	var lo = bc[(i+2) % 10];
	t[0] = bc[(i+8) % 10] ^ rotl64_lo(hi, lo, 1);
	t[1] = bc[(i+9) % 10] ^ rotl64_hi(hi, lo, 1);

	st[i] ^= t[0];
	st[i+1] ^= t[1];
	st[i+10] ^= t[0];
	st[i+11] ^= t[1];
	st[i+20] ^= t[0];
	st[i+21] ^= t[1];
	st[i+30] ^= t[0];
	st[i+31] ^= t[1];
	st[i+40] ^= t[0];
	st[i+41] ^= t[1];
	}

	// Rho Pi
	t[0] = st[2];
	t[1] = st[3];

	st[ 2] = rotl64_lo(st[13], st[12], 44);
	st[ 3] = rotl64_hi(st[13], st[12], 44);
	st[12] = rotl64_lo(st[19], st[18], 20);
	st[13] = rotl64_hi(st[19], st[18], 20);
	st[18] = rotl64_lo(st[45], st[44], 61);
	st[19] = rotl64_hi(st[45], st[44], 61);
	st[44] = rotl64_lo(st[29], st[28], 39);
	st[45] = rotl64_hi(st[29], st[28], 39);
	st[28] = rotl64_lo(st[41], st[40], 18);
	st[29] = rotl64_hi(st[41], st[40], 18);
	st[40] = rotl64_lo(st[ 5], st[ 4], 62);
	st[41] = rotl64_hi(st[ 5], st[ 4], 62);
	st[ 4] = rotl64_lo(st[25], st[24], 43);
	st[ 5] = rotl64_hi(st[25], st[24], 43);
	st[24] = rotl64_lo(st[27], st[26], 25);
	st[25] = rotl64_hi(st[27], st[26], 25);
	st[26] = rotl64_lo(st[39], st[38], 8);
	st[27] = rotl64_hi(st[39], st[38], 8);
	st[38] = rotl64_lo(st[47], st[46], 56);
	st[39] = rotl64_hi(st[47], st[46], 56);
	st[46] = rotl64_lo(st[31], st[30], 41);
	st[47] = rotl64_hi(st[31], st[30], 41);
	st[30] = rotl64_lo(st[ 9], st[ 8], 27);
	st[31] = rotl64_hi(st[ 9], st[ 8], 27);
	st[ 8] = rotl64_lo(st[49], st[48], 14);
	st[ 9] = rotl64_hi(st[49], st[48], 14);
	st[48] = rotl64_lo(st[43], st[42], 2);
	st[49] = rotl64_hi(st[43], st[42], 2);
	st[42] = rotl64_lo(st[17], st[16], 55);
	st[43] = rotl64_hi(st[17], st[16], 55);
	st[16] = rotl64_lo(st[33], st[32], 45);
	st[17] = rotl64_hi(st[33], st[32], 45);
	st[32] = rotl64_lo(st[11], st[10], 36);
	st[33] = rotl64_hi(st[11], st[10], 36);
	st[10] = rotl64_lo(st[ 7], st[ 6], 28);
	st[11] = rotl64_hi(st[ 7], st[ 6], 28);
	st[ 6] = rotl64_lo(st[37], st[36], 21);
	st[ 7] = rotl64_hi(st[37], st[36], 21);
	st[36] = rotl64_lo(st[35], st[34], 15);
	st[37] = rotl64_hi(st[35], st[34], 15);
	st[34] = rotl64_lo(st[23], st[22], 10);
	st[35] = rotl64_hi(st[23], st[22], 10);
	st[22] = rotl64_lo(st[15], st[14], 6);
	st[23] = rotl64_hi(st[15], st[14], 6);
	st[14] = rotl64_lo(st[21], st[20], 3);
	st[15] = rotl64_hi(st[21], st[20], 3);
	st[20] = rotl64_lo(t[1], t[0], 1);
	st[21] = rotl64_hi(t[1], t[0], 1);

	// Chi

	// Unrolling of the following loop (last iteration sets bc[0] thru bc[9])

	// for (i=0; i<40; i+=10) {
	// bc[0] = st[i];
	// bc[1] = st[i+1];
	// bc[2] = st[i+2];
	// bc[3] = st[i+3];

	// st[i ] ^= (~st[i+2]) & st[i+4];
	// st[i+1] ^= (~st[i+3]) & st[i+5];
	// st[i+2] ^= (~st[i+4]) & st[i+6];
	// st[i+3] ^= (~st[i+5]) & st[i+7];
	// st[i+4] ^= (~st[i+6]) & st[i+8];
	// st[i+5] ^= (~st[i+7]) & st[i+9];
	// st[i+6] ^= (~st[i+8]) & bc[0];
	// st[i+7] ^= (~st[i+9]) & bc[1];
	// st[i+8] ^= (~bc[0]) & bc[2];
	// st[i+9] ^= (~bc[1]) & bc[3];
	// }

	i = 0;
	bc[0] = st[i];
	bc[1] = st[i+1];
	bc[2] = st[i+2];
	bc[3] = st[i+3];

	st[i ] ^= (~st[i+2]) & st[i+4];
	st[i+1] ^= (~st[i+3]) & st[i+5];
	st[i+2] ^= (~st[i+4]) & st[i+6];
	st[i+3] ^= (~st[i+5]) & st[i+7];
	st[i+4] ^= (~st[i+6]) & st[i+8];
	st[i+5] ^= (~st[i+7]) & st[i+9];
	st[i+6] ^= (~st[i+8]) & bc[0];
	st[i+7] ^= (~st[i+9]) & bc[1];
	st[i+8] ^= (~bc[0]) & bc[2];
	st[i+9] ^= (~bc[1]) & bc[3];

	i = 10;
	bc[0] = st[i];
	bc[1] = st[i+1];
	bc[2] = st[i+2];
	bc[3] = st[i+3];

	st[i ] ^= (~st[i+2]) & st[i+4];
	st[i+1] ^= (~st[i+3]) & st[i+5];
	st[i+2] ^= (~st[i+4]) & st[i+6];
	st[i+3] ^= (~st[i+5]) & st[i+7];
	st[i+4] ^= (~st[i+6]) & st[i+8];
	st[i+5] ^= (~st[i+7]) & st[i+9];
	st[i+6] ^= (~st[i+8]) & bc[0];
	st[i+7] ^= (~st[i+9]) & bc[1];
	st[i+8] ^= (~bc[0]) & bc[2];
	st[i+9] ^= (~bc[1]) & bc[3];

	i = 20;
	bc[0] = st[i];
	bc[1] = st[i+1];
	bc[2] = st[i+2];
	bc[3] = st[i+3];

	st[i ] ^= (~st[i+2]) & st[i+4];
	st[i+1] ^= (~st[i+3]) & st[i+5];
	st[i+2] ^= (~st[i+4]) & st[i+6];
	st[i+3] ^= (~st[i+5]) & st[i+7];
	st[i+4] ^= (~st[i+6]) & st[i+8];
	st[i+5] ^= (~st[i+7]) & st[i+9];
	st[i+6] ^= (~st[i+8]) & bc[0];
	st[i+7] ^= (~st[i+9]) & bc[1];
	st[i+8] ^= (~bc[0]) & bc[2];
	st[i+9] ^= (~bc[1]) & bc[3];

	i = 30;
	bc[0] = st[i];
	bc[1] = st[i+1];
	bc[2] = st[i+2];
	bc[3] = st[i+3];

	st[i ] ^= (~st[i+2]) & st[i+4];
	st[i+1] ^= (~st[i+3]) & st[i+5];
	st[i+2] ^= (~st[i+4]) & st[i+6];
	st[i+3] ^= (~st[i+5]) & st[i+7];
	st[i+4] ^= (~st[i+6]) & st[i+8];
	st[i+5] ^= (~st[i+7]) & st[i+9];
	st[i+6] ^= (~st[i+8]) & bc[0];
	st[i+7] ^= (~st[i+9]) & bc[1];
	st[i+8] ^= (~bc[0]) & bc[2];
	st[i+9] ^= (~bc[1]) & bc[3];

	i = 40;
	bc[0] = st[i];
	bc[1] = st[i+1];
	bc[2] = st[i+2];
	bc[3] = st[i+3];
	bc[4] = st[i+4];
	bc[5] = st[i+5];
	bc[6] = st[i+6];
	bc[7] = st[i+7];
	bc[8] = st[i+8];
	bc[9] = st[i+9];

	st[i ] ^= (~st[i+2]) & st[i+4];
	st[i+1] ^= (~st[i+3]) & st[i+5];
	st[i+2] ^= (~st[i+4]) & st[i+6];
	st[i+3] ^= (~st[i+5]) & st[i+7];
	st[i+4] ^= (~st[i+6]) & st[i+8];
	st[i+5] ^= (~st[i+7]) & st[i+9];
	st[i+6] ^= (~st[i+8]) & bc[0];
	st[i+7] ^= (~st[i+9]) & bc[1];
	st[i+8] ^= (~bc[0]) & bc[2];
	st[i+9] ^= (~bc[1]) & bc[3];

	// Iota
	var round_index = round*2;
	st[0] ^= keccakf_rndc[round_index+1];
	st[1] ^= keccakf_rndc[round_index];
	}
	return st;
	}

	// function to rotate two 32 bit ints (as if they are one 64 bit int)
	// n is never 0, so we don't check for that case
	// Because of Javascript treating all bitwise operations as done on
	// signed 32-bit integers, we need to sprinkle in many ">>> 0" ops
	// throughout the code and we also have to always shift by ">>>" not ">>"
	// see here: https://stackoverflow.com/questions/6798111/bitwise-operations-on-32-bit-unsigned-ints

	function rotl64_hi(hi, lo, offset) {

	if (offset & 0x20) { // if offset >= 32 we switch lo and hi
	lo ^= hi;
	hi ^= lo;
	lo ^= hi;
	}

	return (((hi << offset) >>> 0) \| (lo >>> (32 - offset))) >>> 0;
	}

	function rotl64_lo(hi, lo, offset) {

	if (offset & 0x20) { // if offset >= 32 we switch lo and hi
	lo ^= hi;
	hi ^= lo;
	lo ^= hi;
	}

	return (((lo << offset) >>> 0) \| (hi >>> (32 - offset))) >>> 0;
	}

	// i think some of the convenience functions from
	// https://gist.github.com/devi/27a8bfcf77eb765540d2
	// would be a good place to look to make this easier to implement