nealmcb/sampling_with_tickets.ipynb

## sampling_with_tickets.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Demo of Ron Rivest's Election Audit Sampling with Tickets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ron Rivest has proposed an approach, called \"sampling with tickets\", allowing a risk-limiting audit to proceed independently in different counties in parallel, or a single county at different points in time, while still achieving an easy, efficient non-stratified sample.\n",
    "\n",
    "This is a quick demo of Ron's approach, in Python 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "import hashlib\n",
    "import heapq\n",
    "import collections"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "Ballot = collections.namedtuple('Ballot', 'ticket, id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sha256(value):\n",
    "    \"\"\"Return the SHA-256 hash of the given string as a hex digest\n",
    "\n",
    "    >>> sha256('')\n",
    "    'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'\n",
    "    \"\"\"\n",
    "\n",
    "    # for python 2, may need: if isinstance(value, unicode):        value = value.encode('utf-8')\n",
    "    return hashlib.sha256(value.encode('utf-8')).hexdigest()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def setup_ballots(N, seed):\n",
    "    \"return a heap of ballots\"\n",
    "\n",
    "    ballots = []\n",
    "\n",
    "    for ballot_id in range(N):\n",
    "        heapq.heappush(ballots, Ballot(\n",
    "                int(sha256(seed + str(ballot_id)), 16) * 2 ** -256,\n",
    "                       ballot_id))\n",
    "\n",
    "    return ballots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sample_ballots(ballot_spec, seed, sample_sizes):\n",
    "    \"\"\"For each round size in sample_sizes, return a sample with replacement of the given size\n",
    "    ballot_spec is either an integer or (someday) the name of a manifest file.\n",
    "    sample_sizes is an array of round sizes.\n",
    "\n",
    "    TODO: if \"ballots\" is a file name, interpret it as a manifest\n",
    "    TODO: Each round should be sorted by id.\n",
    "    \"\"\"\n",
    "\n",
    "    if isinstance(ballot_spec, int):\n",
    "        ballots = setup_ballots(ballot_spec, seed)\n",
    "\n",
    "    # Maintain the array \"ballots\" as a heap with the invariant that all ballots\n",
    "    # are uniformly distributed over the range [z, 1.0], where z is the ticket\n",
    "    # number of the most recent sample.\n",
    "    \n",
    "    samples = []\n",
    "    for sample_size in sample_sizes:\n",
    "        sample = []\n",
    "        for _ in range (sample_size):\n",
    "            next_selection = heapq.heappop(ballots)\n",
    "            sample.append(next_selection)\n",
    "            next_selection = Ballot(random.uniform(next_selection.ticket, 1.0), next_selection.id)\n",
    "            heapq.heappush(ballots, next_selection)\n",
    "        samples.append(sample)\n",
    "\n",
    "    merged_samples = [sample for round in samples for sample in round]\n",
    "    counts = collections.Counter([ballot.id for ballot in merged_samples])\n",
    "    dups = [(id, count) for id, count in counts.items() if count > 1]\n",
    "    print(\"Note: dups ([id, count), ...]: %s\" % dups)\n",
    "    \n",
    "    return samples"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sampling with replacement"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Note: dups ([id, count), ...]: [(2, 2)]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[[Ballot(ticket=0.04100190558301588, id=13),\n",
       "  Ballot(ticket=0.0423833995335369, id=19),\n",
       "  Ballot(ticket=0.07352435971093178, id=4),\n",
       "  Ballot(ticket=0.11101993350291864, id=10),\n",
       "  Ballot(ticket=0.1775780699245684, id=2)],\n",
       " [Ballot(ticket=0.19977805644581484, id=14),\n",
       "  Ballot(ticket=0.22148899861362173, id=16),\n",
       "  Ballot(ticket=0.24878913499709562, id=9),\n",
       "  Ballot(ticket=0.32328121459631143, id=2),\n",
       "  Ballot(ticket=0.3306671180518882, id=18)]]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample_ballots(20, \"12345\", [5, 5])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sampling without replacement"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Ballot(ticket=0.014349416384859667, id=4),\n",
       " Ballot(ticket=0.11322107598015423, id=16),\n",
       " Ballot(ticket=0.1352993540969171, id=0),\n",
       " Ballot(ticket=0.14647280636254867, id=13),\n",
       " Ballot(ticket=0.19160349460439044, id=5),\n",
       " Ballot(ticket=0.19489100529351447, id=15),\n",
       " Ballot(ticket=0.2669856231270792, id=14),\n",
       " Ballot(ticket=0.2747320876434722, id=3),\n",
       " Ballot(ticket=0.29058138956157303, id=19),\n",
       " Ballot(ticket=0.32285949040380824, id=1)]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ballots = setup_ballots(20, \"123\")\n",
    "sample = heapq.nsmallest(10, ballots)\n",
    "sample"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  },
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "95.234388px",
    "width": "253.984188px"
   },
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 4,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": false,
   "widenNotebook": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Demo of Ron Rivest's Election Audit Sampling with Tickets"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Ron Rivest has proposed an approach, called \"sampling with tickets\", allowing a risk-limiting audit to proceed independently in different counties in parallel, or a single county at different points in time, while still achieving an easy, efficient non-stratified sample.\n",
	"\n",
	"This is a quick demo of Ron's approach, in Python 3"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import random\n",
	"import hashlib\n",
	"import heapq\n",
	"import collections"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"Ballot = collections.namedtuple('Ballot', 'ticket, id')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"def sha256(value):\n",
	" \"\"\"Return the SHA-256 hash of the given string as a hex digest\n",
	"\n",
	" >>> sha256('')\n",
	" 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'\n",
	" \"\"\"\n",
	"\n",
	" # for python 2, may need: if isinstance(value, unicode): value = value.encode('utf-8')\n",
	" return hashlib.sha256(value.encode('utf-8')).hexdigest()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"def setup_ballots(N, seed):\n",
	" \"return a heap of ballots\"\n",
	"\n",
	" ballots = []\n",
	"\n",
	" for ballot_id in range(N):\n",
	" heapq.heappush(ballots, Ballot(\n",
	" int(sha256(seed + str(ballot_id)), 16) * 2 ** -256,\n",
	" ballot_id))\n",
	"\n",
	" return ballots"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"def sample_ballots(ballot_spec, seed, sample_sizes):\n",
	" \"\"\"For each round size in sample_sizes, return a sample with replacement of the given size\n",
	" ballot_spec is either an integer or (someday) the name of a manifest file.\n",
	" sample_sizes is an array of round sizes.\n",
	"\n",
	" TODO: if \"ballots\" is a file name, interpret it as a manifest\n",
	" TODO: Each round should be sorted by id.\n",
	" \"\"\"\n",
	"\n",
	" if isinstance(ballot_spec, int):\n",
	" ballots = setup_ballots(ballot_spec, seed)\n",
	"\n",
	" # Maintain the array \"ballots\" as a heap with the invariant that all ballots\n",
	" # are uniformly distributed over the range [z, 1.0], where z is the ticket\n",
	" # number of the most recent sample.\n",
	" \n",
	" samples = []\n",
	" for sample_size in sample_sizes:\n",
	" sample = []\n",
	" for _ in range (sample_size):\n",
	" next_selection = heapq.heappop(ballots)\n",
	" sample.append(next_selection)\n",
	" next_selection = Ballot(random.uniform(next_selection.ticket, 1.0), next_selection.id)\n",
	" heapq.heappush(ballots, next_selection)\n",
	" samples.append(sample)\n",
	"\n",
	" merged_samples = [sample for round in samples for sample in round]\n",
	" counts = collections.Counter([ballot.id for ballot in merged_samples])\n",
	" dups = [(id, count) for id, count in counts.items() if count > 1]\n",
	" print(\"Note: dups ([id, count), ...]: %s\" % dups)\n",
	" \n",
	" return samples"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Sampling with replacement"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Note: dups ([id, count), ...]: [(2, 2)]\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"[[Ballot(ticket=0.04100190558301588, id=13),\n",
	" Ballot(ticket=0.0423833995335369, id=19),\n",
	" Ballot(ticket=0.07352435971093178, id=4),\n",
	" Ballot(ticket=0.11101993350291864, id=10),\n",
	" Ballot(ticket=0.1775780699245684, id=2)],\n",
	" [Ballot(ticket=0.19977805644581484, id=14),\n",
	" Ballot(ticket=0.22148899861362173, id=16),\n",
	" Ballot(ticket=0.24878913499709562, id=9),\n",
	" Ballot(ticket=0.32328121459631143, id=2),\n",
	" Ballot(ticket=0.3306671180518882, id=18)]]"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sample_ballots(20, \"12345\", [5, 5])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Sampling without replacement"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[Ballot(ticket=0.014349416384859667, id=4),\n",
	" Ballot(ticket=0.11322107598015423, id=16),\n",
	" Ballot(ticket=0.1352993540969171, id=0),\n",
	" Ballot(ticket=0.14647280636254867, id=13),\n",
	" Ballot(ticket=0.19160349460439044, id=5),\n",
	" Ballot(ticket=0.19489100529351447, id=15),\n",
	" Ballot(ticket=0.2669856231270792, id=14),\n",
	" Ballot(ticket=0.2747320876434722, id=3),\n",
	" Ballot(ticket=0.29058138956157303, id=19),\n",
	" Ballot(ticket=0.32285949040380824, id=1)]"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"ballots = setup_ballots(20, \"123\")\n",
	"sample = heapq.nsmallest(10, ballots)\n",
	"sample"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.5"
	},
	"toc": {
	"colors": {
	"hover_highlight": "#DAA520",
	"running_highlight": "#FF0000",
	"selected_highlight": "#FFD700"
	},
	"moveMenuLeft": true,
	"nav_menu": {
	"height": "95.234388px",
	"width": "253.984188px"
	},
	"navigate_menu": true,
	"number_sections": true,
	"sideBar": true,
	"threshold": 4,
	"toc_cell": false,
	"toc_section_display": "block",
	"toc_window_display": false,
	"widenNotebook": false
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}