@mikk-c
Last active November 7, 2018 13:29
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import random, itertools\n",
"import networkx as nx\n",
"\n",
"# From http://interactome.dfci.harvard.edu/S_cerevisiae/download/Collins.txt\n",
"G = nx.read_edgelist(\"collins.txt\", delimiter = \"\\t\")\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# 1.1\n",
"\"\"\"\n",
"It is always a good idea to break complex problems into their\n",
"simpler components. Since in SIS we need to perform multiple\n",
"tasks, let's have a function implementing each one of those\n",
"tasks. First, we implement the S-I step. Given a graph G and\n",
"a set of currently infected nodes (seeds), we iterate over\n",
"their neighbors and we toss a coin: with probability beta we\n",
"add the neighbor to the newly infected pool. (This is how\n",
"contagion works in SIS and in the independent cascade model.\n",
"If we were implementing a threshold model the function would\n",
"be very different: you would have to iterate over the same\n",
"nodes and count the number of infected neighbors.) Note the\n",
"first line: we copy the seed set, because we cannot change a\n",
"set over which we're iterating.\n",
"\"\"\"\n",
"\n",
"def s_to_i(G, seeds, beta):\n",
"    new_infected = seeds.copy()\n",
"    for seed in seeds:\n",
"        seed_neighbors = G.neighbors(seed)\n",
"        for neighbor in seed_neighbors:\n",
"            if random.random() < beta:\n",
"                new_infected.add(neighbor)\n",
"    return new_infected\n",
"\n",
"\"\"\"\n",
"Now let's tackle the I-S step. This is simpler: we don't need\n",
"the graph. Each node has a stochastic recovery probability, no\n",
"matter the number of infected neighbors. So let's just iterate\n",
"over the infected nodes, and toss the recovery coin: with\n",
"probability mu the node recovers. Again, remember to first copy\n",
"the seed set, since you're iterating over it. (Note: if this\n",
"were an SIR model, this function would be different! You would\n",
"need a \"recovered\" node set, and a node in that set cannot be\n",
"infected any more. If this were an SI model, this function\n",
"would not exist!)\n",
"\"\"\"\n",
"\n",
"def i_to_s(seeds, mu):\n",
"    new_infected = seeds.copy()\n",
"    for seed in seeds:\n",
"        if random.random() < mu:\n",
"            new_infected.remove(seed)\n",
"    return new_infected\n",
"\n",
"\"\"\"\n",
"The main function! Once the S-I and I-S steps are implemented,\n",
"it is very simple: iterate for the required number of steps\n",
"and call S-I and I-S in succession. Print the result.\n",
"\"\"\"\n",
"\n",
"def sis(G, seeds, beta, mu, steps, outfile):\n",
"    with open(outfile, 'w') as f:\n",
"        for step in range(steps):\n",
"            seeds = s_to_i(G, seeds, beta) # S -> I\n",
"            seeds = i_to_s(seeds, mu)      # I -> S\n",
"            f.write(\"%d\\t%1.4f\\n\" % (step, len(seeds) / len(G.nodes)))\n"
]
},
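{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check, not part of the original exercise: with\n",
"# beta = 1 every neighbor of an infected node is infected for sure,\n",
"# and with mu = 0 (or 1) nobody (or everybody) recovers, so the\n",
"# outcome on the path graph 0-1-2 is deterministic.\n",
"toy = nx.path_graph(3)\n",
"assert s_to_i(toy, {0}, 1.0) == {0, 1}\n",
"assert i_to_s({0, 1}, 0.0) == {0, 1}\n",
"assert i_to_s({0, 1}, 1.0) == set()\n"
]
},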
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# 1.2\n",
"\n",
"\"\"\"\n",
"We want to start from ten random nodes, so we use the random.sample\n",
"function over the list of nodes (newer versions of Python require\n",
"an actual sequence here, hence the list() call). Note that the sis\n",
"function expects seeds to be a set, while random.sample returns a\n",
"list!\n",
"\"\"\"\n",
"\n",
"sis(G, set(random.sample(list(G.nodes), 10)), 0.75, 0.01, 100, \"Q1.2\")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# 1.3\n",
"\n",
"\"\"\"\n",
"We want the ten nodes with the largest degree. So we first sort\n",
"the G.degree view. Since it yields (node, degree) pairs, we need\n",
"to specify the sorting criterion (by value, thus x[1] for the key\n",
"parameter of the sorted function -- check the documentation!).\n",
"sorted returns the list in ascending order; we need descending,\n",
"so we pass reverse = True.\n",
"\"\"\"\n",
"\n",
"sorted_degree = sorted(G.degree, key = lambda x: x[1], reverse = True)\n",
"\n",
"# Then we take the top 10.\n",
"hubs_degree = sorted_degree[:10]\n",
"\n",
"# Then we have to transform this list of (node, degree) tuples into a set of nodes, which is what the sis function wants.\n",
"hubs = set([hub_degree[0] for hub_degree in hubs_degree])\n",
"\n",
"# Note: you could have written the previous three lines in a single one:\n",
"# hubs = set([hub_degree[0] for hub_degree in sorted(G.degree, key = lambda x: x[1], reverse = True)[:10]])\n",
"\n",
"sis(G, hubs, 0.75, 0.01, 100, \"Q1.3\")\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\nAnswer: The two spreading events differ when it comes to the\\ninitial speed. The hubs spread the disease more quickly -- for\\ninstance at the first step the infection rate is ~11%, against\\nthe random case of ~5%: more than double! However, there is no\\nappreciable difference in the endemic state: they both stabilize\\naround 60-63% of the total population. That is because, in a SIS\\nmodel, the endemic state only depends on lambda = beta / mu, not\\non the initial set of seeds.\\n'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 1.4\n",
"\n",
"\"\"\"\n",
"Answer: The two spreading events differ when it comes to the\n",
"initial speed. The hubs spread the disease more quickly -- for\n",
"instance at the first step the infection rate is ~11%, against\n",
"the random case of ~5%: more than double! However, there is no\n",
"appreciable difference in the endemic state: they both stabilize\n",
"around 60-63% of the total population. That is because, in a SIS\n",
"model, the endemic state only depends on lambda = beta / mu, not\n",
"on the initial set of seeds.\n",
"\"\"\"\n"
]
},
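{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick arithmetic behind the answer above (not part of the original\n",
"# exercise): both runs share the same lambda = beta / mu, which is\n",
"# why their endemic states agree.\n",
"beta, mu = 0.75, 0.01\n",
"print(\"lambda = beta / mu =\", beta / mu)\n"
]
},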
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# 2.1\n",
"\"\"\"\n",
"We pick a random sample from G.edges. Note that the size of the\n",
"sample (second parameter of the random.sample function) has to be\n",
"an integer, thus we do a floor division (check the docs!). We\n",
"then remove the test set from the graph: note that the networkx\n",
"remove_edges_from function modifies your graph in place! If you\n",
"don't want that, you have to make a copy first.\n",
"\"\"\"\n",
"G = nx.read_edgelist(\"Binary-GS.txt\", delimiter = \"\\t\")\n",
"test_set = set(random.sample(list(G.edges), len(G.edges) // 10))\n",
"G.remove_edges_from(test_set)\n"
]
},
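{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check, not part of the original exercise: after\n",
"# remove_edges_from, none of the held-out test edges should still be\n",
"# in the training graph.\n",
"assert all(not G.has_edge(u, v) for u, v in test_set)\n"
]
},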
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# 2.2\n",
"\"\"\"\n",
"Taking this step by step. If we want to predict new connections,\n",
"we have to iterate over non-connected node pairs. The easiest way\n",
"to do it is to generate all possible node pairs and skip the ones\n",
"that are already connected. The common neighbor score of a pair\n",
"is the number of neighbors the two nodes share, i.e. the size of\n",
"the intersection of their neighbor sets. We then sort our scores\n",
"in descending order and take the top ones.\n",
"\"\"\"\n",
"def common_neighbor(G, size):\n",
"    score = {}\n",
"    # itertools.product yields ordered pairs, so skip self-pairs and\n",
"    # pairs that are already edges (has_edge checks both orientations\n",
"    # in an undirected graph, unlike subtracting set(G.edges)).\n",
"    for u, v in itertools.product(G.nodes, repeat = 2):\n",
"        if u == v or G.has_edge(u, v):\n",
"            continue\n",
"        score[(u, v)] = len(set(G.neighbors(u)) & set(G.neighbors(v)))\n",
"    sorted_guesses = sorted(score.items(), key = lambda x: x[1], reverse = True)\n",
"    sorted_best_guesses = sorted_guesses[:size]\n",
"    return set([guess[0] for guess in sorted_best_guesses])\n",
"\n"
]
},
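{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional toy check, not part of the original exercise: on the path\n",
"# graph 0-1-2, nodes 0 and 2 are unconnected but share neighbor 1,\n",
"# so the pair {0, 2} must show up among the top guesses.\n",
"toy = nx.path_graph(3)\n",
"assert any(set(pair) == {0, 2} for pair in common_neighbor(toy, 5))\n"
]
},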
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Common Neighbor: Precision 0.0210; Recall 0.1603\n",
"Random: Precision 0.0000; Recall 0.0000\n"
]
}
],
"source": [
"# 2.3\n",
"\"\"\"\n",
"Precision = how many times we were right over how many times we tried.\n",
"Recall = the share of actual edges from the test set that we found.\n",
"\"\"\"\n",
"guesses = common_neighbor(G, 1000)\n",
"precision = len(guesses & test_set) / len(guesses)\n",
"recall = len(guesses & test_set) / len(test_set)\n",
"print(\"Common Neighbor: Precision %1.4f; Recall %1.4f\" % (precision, recall))\n",
"\n",
"\"\"\"\n",
"In a random guess we also go over all the unconnected node pairs, but\n",
"we skip the scoring phase: we just pick up random pairs.\n",
"\"\"\"\n",
"def random_guess(G, size):\n",
"    # Skip self-pairs and already connected pairs; random.sample\n",
"    # needs a sequence, hence the list.\n",
"    candidates = [(u, v) for u, v in itertools.product(G.nodes, repeat = 2)\n",
"                  if u != v and not G.has_edge(u, v)]\n",
"    return set(random.sample(candidates, size))\n",
"\n",
"random_guesses = random_guess(G, 1000)\n",
"random_precision = len(random_guesses & test_set) / len(random_guesses)\n",
"random_recall = len(random_guesses & test_set) / len(test_set)\n",
"print(\"Random: Precision %1.4f; Recall %1.4f\" % (random_precision, random_recall))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}