Skip to content

Instantly share code, notes, and snippets.

@sdruskat
Last active June 1, 2023 15:36
Show Gist options
  • Save sdruskat/c61cb274945159c9e891a8f29aff450b to your computer and use it in GitHub Desktop.
Save sdruskat/c61cb274945159c9e891a8f29aff450b to your computer and use it in GitHub Desktop.
A Jupyter notebook showing slightly more complex stratified proportionate random sampling from a Dask dataset based on value counts.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>software</th>\n",
" <th>instance</th>\n",
" <th>data</th>\n",
" <th>mention_counts</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>a</td>\n",
" <td>1</td>\n",
" <td>EINS</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>a</td>\n",
" <td>2</td>\n",
" <td>ZWEI</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>a</td>\n",
" <td>3</td>\n",
" <td>DREI</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>a</td>\n",
" <td>4</td>\n",
" <td>VIER</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>a</td>\n",
" <td>5</td>\n",
" <td>FÜNF</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>a</td>\n",
" <td>6</td>\n",
" <td>SECHS</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>a</td>\n",
" <td>7</td>\n",
" <td>SIEBEN</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>a</td>\n",
" <td>8</td>\n",
" <td>ACHT</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>b</td>\n",
" <td>9</td>\n",
" <td>NEUN</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>b</td>\n",
" <td>10</td>\n",
" <td>ZEHN</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>b</td>\n",
" <td>11</td>\n",
" <td>ELF</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>b</td>\n",
" <td>12</td>\n",
" <td>ZWÖLF</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>b</td>\n",
" <td>13</td>\n",
" <td>DREIZEHN</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>c</td>\n",
" <td>14</td>\n",
" <td>VIERZEHN</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>c</td>\n",
" <td>15</td>\n",
" <td>FÜNFZEHN</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>c</td>\n",
" <td>16</td>\n",
" <td>SECHZEHN</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>c</td>\n",
" <td>17</td>\n",
" <td>SIEBZEHN</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>d</td>\n",
" <td>18</td>\n",
" <td>ACHTZEHN</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>d</td>\n",
" <td>19</td>\n",
" <td>NEUNZEHN</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>d</td>\n",
" <td>20</td>\n",
" <td>ZWANZIG</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" software instance data mention_counts\n",
"0 a 1 EINS 8\n",
"1 a 2 ZWEI 8\n",
"2 a 3 DREI 8\n",
"3 a 4 VIER 8\n",
"4 a 5 FÜNF 8\n",
"5 a 6 SECHS 8\n",
"6 a 7 SIEBEN 8\n",
"7 a 8 ACHT 8\n",
"8 b 9 NEUN 5\n",
"9 b 10 ZEHN 5\n",
"10 b 11 ELF 5\n",
"11 b 12 ZWÖLF 5\n",
"12 b 13 DREIZEHN 5\n",
"13 c 14 VIERZEHN 4\n",
"14 c 15 FÜNFZEHN 4\n",
"15 c 16 SECHZEHN 4\n",
"16 c 17 SIEBZEHN 4\n",
"17 d 18 ACHTZEHN 3\n",
"18 d 19 NEUNZEHN 3\n",
"19 d 20 ZWANZIG 3"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create original dataframe (filtered_czi)\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import dask.dataframe as dd\n",
"import random\n",
"import matplotlib.pyplot as plt\n",
"ddf_filtered_czi = pd.DataFrame({\n",
" 'software': ['a','a','a','a','a','a','a','a','b','b','b','b','b','c','c','c','c','d','d','d'],\n",
" 'instance': [x for x in range(1, 21)],\n",
" 'data': ['EINS', 'ZWEI', 'DREI', 'VIER', 'FÜNF', 'SECHS', 'SIEBEN', 'ACHT', 'NEUN', 'ZEHN', \n",
" 'ELF', 'ZWÖLF', 'DREIZEHN', 'VIERZEHN', 'FÜNFZEHN', 'SECHZEHN', 'SIEBZEHN', 'ACHTZEHN', 'NEUNZEHN', 'ZWANZIG']\n",
"})\n",
"# test_df.set_index('software', inplace=True)\n",
"ddf_filtered_czi = dd.from_pandas(ddf_filtered_czi, npartitions=1)\n",
"\n",
"# Add a column that contains the mention counts for each software\n",
"ddf_filtered_czi['mention_counts'] = ddf_filtered_czi['software'].map(ddf_filtered_czi['software'].value_counts())\n",
"ddf_filtered_czi.compute()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAi4AAAGdCAYAAAA1/PiZAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAWpklEQVR4nO3df6xXBf348dcF4qrJvYkIdAXEzLRLCgUXQmvJusbuDB1NM2d1w821xdK66we0CVka+EeMVu+BP6boWupcyVrOH4PVWEvHBYU0poj5UQIFbXav3NZl3vv+/tG8fQkNxPe9576Oj8f2/uOc9+W8X/egvJ877/M+p65arVYDACCBEUUPAABwtIQLAJCGcAEA0hAuAEAawgUASEO4AABpCBcAIA3hAgCkMaroAWqtv78/9u7dG2PGjIm6urqixwEAjkK1Wo3XX389mpqaYsSItz+uUrpw2bt3b0yePLnoMQCAY7B79+6YNGnS2z5funAZM2ZMRPz7F29oaCh4GgDgaHR3d8fkyZMH3sffTunC5c2PhxoaGoQLACRzpNM8nJwLAKQhXACANIQLAJCGcAEA0hAuAEAawgUASEO4AABpCBcAII1hGS4LFy6Mk046KS699NKiRwEAhpFhGS7XXntt3HXXXUWPAQAMM8MyXC644IIj3qsAAHjvqXm4bNq0KRYsWBBNTU1RV1cX69evP+xnKpVKTJ06NY477riYM2dObN68udZjAAAlVPNw6enpienTp0elUnnL5++9997o6OiI5cuXx+OPPx7Tp0+P+fPnx/79+4/p9Xp7e6O7u/uQBwBQTjUPl7a2trjhhhti4cKFb/n8qlWr4uqrr45FixZFc3NzrF27Nk444YS4/fbbj+n1VqxYEY2NjQOPyZMnv5vxAYBhbEjPcTl48GBs3bo1Wltb/zPAiBHR2toajz766DFtc+nSpdHV1TXw2L17d63GBQCGmVFD+WKvvvpq9PX1xYQJEw5ZP2HChHj66acHlltbW2P79u3R09MTkyZNivvuuy/mzp37ltusr6+P+vr6QZ0bABgehjRcjtaGDRuKHgEAGIaG9KOicePGxciRI2Pfvn2HrN+3b19MnDhxKEcBABIa0nAZPXp0zJw5MzZu3Diwrr+/PzZu3Pi2HwUBALyp5h8VHThwIHbt2jWw/Pzzz8e2bdti7NixMWXKlOjo6Ij29vaYNWtWzJ49O1avXh09PT2xaNGiWo8CAJRMzcNly5YtMW/evIHljo6OiIhob2+PdevWxeWXXx6vvPJKLFu2LF5++eWYMWNGPPTQQ4edsAsA8N/qqtVqteghaqFSqUSlUom+vr7YuXNndHV1RUNDQ9FjAQBHobu7OxobG4/4/l2acHnT0f7iAMDwcbTv38PyJosAAG9FuAAAaQgXACAN4QIApCFcAIA0hAsAkIZwAQDSKE24VCqVaG5ujpaWlqJHAQAGiQvQAQCFcwE6AKB0hAsAkIZwAQDSEC4AQBrCBQBIQ7gAAGkIFwAgDeECAKRRmnBx5VwAKD9XzgUACufKuQBA6QgXACAN4QIApCFcAIA0hAsAkIZwAQDSEC4AQBrCBQBIQ7gAAGmUJlxc8h8Ays8l/wGAwrnkPwBQOsIFAEhDuAAAaQgXACAN4QIApCFcAIA0hAsAkIZwAQDSEC4AQBrCBQBIQ7gAAGmUJlzcZBEAys9NFgGAwrnJIgBQOsIFAEhDuAAAaQgXACAN4QIApCFcAIA0hAsAkIZwAQDSEC4AQBrCBQBIQ7gAAGkIFwAgDeECAKQhXACANEoTLpVKJZqbm6OlpaXoUQCAQVJXrVarRQ9RS93d3dHY2BhdXV3R0NBQ9DgAwFE42vfv0hxxAQDKT7gAAGkIFwAgDeECAKQhXACANIQLAJCGcAEA0hAuAEAawgUASEO4AABpCBcAIA3hAgCkIVwAgDSECwCQhnABANIQLgBAGsIFAEhDuAAAaQgXACCN0oRLpVKJ5ubmaGlpKXoUAGCQ1FWr1WrRQ9RSd3d3NDY2RldXVzQ0NBQ9DgBwFI72/bs0R1wAgPITLgBAGsIFAEhDuAAAaQgXACAN4QIApCFcAIA0hAsAkIZwAQDSEC4AQBrCBQBIQ7gAAGkIFwAgDeECAKQhXACANIQLAJCGcAEA0hAuAEAawgUASEO4AABpCBcAIA3hAgCkIVwAgDSECwCQhnABANIQLgBAGsIFAEhDuAAAaZQmXCqVSjQ3N0dLS0vRowAAg6SuWq1Wix6ilrq7u6OxsTG6urqioaGh6HEAgKNwtO/fpTniAgCUn3ABANIQLgBAGsIFAEhDuAAAaQgXACAN4QIApCFcAIA0hAsAkIZwAQDSEC4AQBrCBQBIQ7gAAGkIFwAgDeECAKQhXACANIQLAJCGcAEA0hAuAEAawgUASEO4AABpCBcAIA3hAgCkIVwAgDSECwCQhnABANIQLgBAGsIFAEhDuAAAaQgXACAN4QIApCFcAIA0hAsAkIZwAQDSEC4AQBrCBQBIQ7gAAGkIFwAgDeECAKQhXACANIQLAJCGcAEA0hAuAEAawzJcfve738VZZ50VZ555Ztx2221FjwMADBOjih7gv73xxhvR0dERv//976OxsTFmzpwZCxcujJNPPrno0QCAgg27Iy6bN2+OadOmxamnnhonnnhitLW1xSOPPFL0WADAMFDzcNm0aVMsWLAgmpqaoq6uLtavX3/Yz1QqlZg6dWocd9xxMWfOnNi8efPAc3v37o1TTz11YPnUU0+NPXv21HpMACChmodLT09PTJ8+PSqVyls+f++990ZHR0csX748Hn/88Zg+fXrMnz8/9u/ff0yv19vbG93d3Yc8AIByqnm4tLW1xQ033BALFy58y+dXrVoVV199dSxatCiam5tj7dq1ccIJJ8Ttt98eERFNTU2HHGHZs2dPNDU1ve3rrVixIhobGwcekydPru0v9P+ZuuSBQds2AHBkQ3qOy8GDB2Pr1q3R2tr6nwFGjIjW1tZ49NFHIyJi9uzZ8dRTT8WePXviwIED8eCDD8b8+fPfdptLly6Nrq6ugcfu3bsH/fcAAIoxpN8qevXVV6Ovry8mTJhwyPoJEybE008//e+BRo2Kn/70pzFv3rzo7++P733ve//zG0X19fVRX18/qHMDAMPDsPs6dETExRdfHBdffHHRYwAAw8yQflQ0bty4GDlyZOzbt++Q9fv27YuJEycO5SgAQEJDGi6jR4+OmTNnxsaNGwfW9ff3x8aNG2Pu3LlDOQoAkFDNPyo6cOBA7Nq1a2D5+eefj23btsXYsWNjypQp0dHREe3t7TFr1qyYPXt2rF69Onp6emLRokW1HgUAKJmah8uWLVti3rx5A8sdHR0REdHe3h7r1q2Lyy+/PF555ZVYtmxZvPzyyzFjxox46KGHDjthFwDgv9VVq9Vq0UPUQqVSiUqlEn19fbFz587o6uqKhoaGmr7G1CUPxP+tvKim2wQAIrq7u6OxsfGI79/D7l5Fx2rx4sWxY8eO6OzsLHoUAGCQlCZcAIDyEy4AQBrCBQBIQ7gAAGkIFwAgDeECAKQhXACANEoTLpVKJZqbm6OlpaXoUQCAQVKacHEBOgAov9KECwBQfsIFAEhDuAAAaQgXACAN4QIApCFcAIA0hAsAkIZwAQDSKE24uHIuAJRfacLFlXMBoPxKEy4AQPkJFwAgDeECAKQhXACANIQLAJCGcAEA0hAuAEAawgUASEO4AABplCZcXPIfAMqvNOHikv8AUH6lCRcAoPyECwCQhnABANIQLgBAGsIFAEhDuAAAaQgXACAN4QIApCFcAIA0hAsAkIZwAQDSKE24uMkiAJRfacLFTRYBoPxKEy4AQPkJFwAgDeECAKQhXACANIQLAJCGcAEA0hAuAEAawgUASEO4AABpCBcAIA3hAgCkIVwAgDSECwCQhnABANIoTbhUKpVobm6OlpaWokcBAAZJacJl8eLFsWPHjujs7Cx6FABgkJQmXACA8hMuAEAawgUASEO4AABpCBcAIA3hAgCkIVwAgDSECwCQhnABANIQLgBAGsIFAEhDuAAAaQgXACAN4QIApCFcAIA0hAsAkIZwAQDSEC4AQBrCBQBIozThUqlUorm5OVpaWooeBQAYJKUJl8WLF8eOHTuis7Oz6FEAgEFSmnABAMpPuAAAaQgXACAN4QIApCFcAIA0hAsAkIZwAQDSEC4AQBrCBQBIQ7gAAGkIFwAgDeECAKQhXACANIQLAJCGcAEA0hAuAEAawgUASEO4AABpCBcAIA3hAgCkIVwAgDSECwCQhnABANIQLgBAGsIFAEhDuAAAaQgXACAN4QIApFGacKlUKtHc3BwtLS1FjwIADJLShMvixYtjx44d0dnZWfQoAMAgKU24AADlJ1wAgDSECwCQhnABANIQLgBAGsIFAEhDuAAAaQgXACAN4QIApCFcAIA0hAsAkIZwAQDSEC4AQBrCBQBIQ7gAAGkIFwAgDeECAKQhXACANIQLAJCGcAEA0hAuAEAawgUASEO4AABpCBcAIA3hAgCkIVwAgDSECwCQhnABANIQLgBAGsIFAEhDuAAAaQgXACAN4QIApCFcAIA0hAsAkIZwAQDSEC4AQBrCBQBIQ7gAAGkIFwAgDeECAKQhXACANIQLAJDGsAyXhQsXxkknnRSXXnpp0aMAAMPIsAyXa6+9Nu66666ixwAAhplhGS4XXHBBjBkzpugxAIBh5h2Hy6ZNm2LBggXR1NQUdXV1sX79+sN+plKpxNSpU+O4446LOXPmxObNm2sxKwDwHjfqnf6Bnp6emD59elx11VXxhS984bDn77333ujo6Ii1a9fGnDlzYvXq1TF//vx45plnYvz48RERMWPGjHjjjTcO+7OPPPJINDU1vaN5ent7o7e3d2C5u7v7Hf5GAEAW7zhc2traoq2t7W2fX7VqVVx99dWxaNGiiIhYu3ZtPPDAA3H77bfHkiVLIiJi27ZtxzbtW1ixYkVcf/31NdseADB81fQcl4MHD8bWrVujtbX1Py8wYkS0trbGo48+WsuXGrB06dLo6uoaeOzevXtQXgcAKN47PuLyv7z66qvR19cXEyZMOGT9hAkT4umnnz7q7bS2tsb27dujp6cnJk2aFPfdd1/MnTv3LX+2vr4+6uvr39XcAEAONQ2XWtmwYUPRIwAAw1BNPyoaN25cjBw5Mvbt23fI+n379sXEiRNr+VIAwHtQTcNl9OjRMXPmzNi4cePAuv7+/ti4cePbftQDAHC03vFHRQcOHIhdu3YNLD///POxbdu2GDt2bEyZMiU6Ojqivb09Zs2aFbNnz47Vq1dHT0/PwLeMAACO1TsOly1btsS8efMGljs6OiIior29PdatWxeXX355vPLKK7Fs2bJ4+eWXY8aMGfHQQw8ddsIuAMA7VVetVqtFD1ELlUolKpVK9PX1xc6dO6OrqysaGhpq+hpTlzwQ/7fyoppuEwD49wVkGxsbj/j+PSzvVXQsFi9eHDt27IjOzs6iRwEABklpwgUAKD/hAgCkIVwAgDSECwCQhnABANIQLgBAGsPyJovvxpuXpenu7q75tvt7/zko2wWA97o331+PdHm50l2A7uDBg/Hcc88VPQ4AcAx2794dkyZNetvnSxMub+rv74+9e/fGmDFjoq6urmbb7e7ujsmTJ8fu3btrfkVeDmVfDw37eWjYz0PDfh4ag7mfq9VqvP7669HU1BQjRrz9mSyl+6hoxIgR/7PU3q2Ghgb/UwwR+3po2M9Dw34eGvbz0Bis/dzY2HjEn3FyLgCQhnABANIQLkepvr4+li9fHvX19UWPUnr29dCwn4eG/Tw07OehMRz2c+lOzgUAyssRFwAgDeECAKQhXACANIQLAJCGcDmCNWvWxLnnnjtwsZ25c+fGgw8+WPRYpbdy5cqoq6uLb33rW0WPUjo//OEPo66u7pDH2WefXfRYpbRnz5748pe/HCeffHIcf/zxcc4558SWLVuKHqtUpk6deth/z3V1dbF48eKiRyuVvr6+uO666+L000+P448/Ps4444z48Y9/fMT7Cg2G0l05t9YmTZoUK1eujDPPPDOq1Wrceeedcckll8QTTzwR06ZNK3q8Uurs7Iybb745zj333KJHKa1p06bFhg0bBpZHjfJPQa299tprcf7558e8efPiwQcfjFNOOSWeffbZOOmkk4oerVQ6Ozujr69vYPmpp56KCy+8MC677LICpyqfm266KdasWRN33nlnTJs2LbZs2RKLFi2KxsbGuOaaa4Z0Fv9aHcGCBQsOWb7xxhtjzZo18dhjjwmXQXDgwIG48sor49Zbb40bbrih6HFKa9SoUTFx4sSixyi1m266KSZPnhx33HHHwLrTTz+9wInK6ZRTTjlkeeXKlXHGGWfEZz7zmYImKqc//elPcckll8RFF10UEf8+0nX33XfH5s2bh3wWHxW9A319fXHPPfdET09PzJ07t+hxSmnx4sVx0UUXRWtra9GjlNqzzz4bTU1N8aEPfSiuvPLKePHFF4seqXR++9vfxqxZs+Kyyy6L8ePHx8c//vG49dZbix6r1A4ePBi//OUv46qrrqrpTXaJOO+882Ljxo2xc+fOiIjYvn17/PGPf4y2trYhn8URl6Pw5JNPxty5c+Nf//pXnHjiiXH//fdHc3Nz0WOVzj333BOPP/54dHZ2Fj1Kqc2ZMyfWrVsXZ511Vrz00ktx/fXXx6c//el46qmnYsyYMUWPVxp//etfY82aNdHR0RE/+MEPorOzM6655poYPXp0tLe3Fz1eKa1fvz7+8Y9/xNe+9rWiRymdJUuWRHd3d5x99tkxcuTI6OvrixtvvDGuvPLKoR+myhH19vZWn3322eqWLVuqS5YsqY4bN676l7/8peixSuXFF1+sjh8/vrp9+/aBdZ/5zGeq1157bXFDvUe89tpr1YaGhuptt91W9Cil8r73va86d+7cQ9Z985vfrH7yk58saKLy+9znPlf9/Oc/X/QYpXT33XdXJ02aVL377rurf/7zn6t33XVXdezYsdV169YN+SyOuByF0aNHx4c//OGIiJg5c2Z0dnbGz372s7j55psLnqw8tm7dGvv3749PfOITA+v6+vpi06ZN8Ytf/CJ6e3tj5MiRBU5YXh/4wAfiIx/5SOzatavoUUrlgx/84GFHZj/60Y/Gr3/964ImKrcXXnghNmzYEL/5zW+KHqWUvvvd78aSJUviS1/6UkREnHPOOfHCCy/EihUrhvwIonA5Bv39/dHb21v0GKXy2c9+Np588slD1i1atCjOPvvs+P73vy9aBtGBAwfiueeei6985StFj1Iq559/fjzzzDOHrNu5c2ecdtppBU1UbnfccUeMHz9+4ORRauuf//xnjBhx6GmxI0eOjP7+/iGfRbgcwdKlS6OtrS2mTJkSr7/+evzqV7+KP/zhD/Hwww8XPVqpjBkzJj72sY8dsu79739/nHzyyYet5935zne+EwsWLIjTTjst9u7dG8uXL4+RI0fGFVdcUfRopfLtb387zjvvvPjJT34SX/ziF2Pz5s1xyy23xC233FL0aKXT398fd9xxR7S3t/tq/yBZsGBB3HjjjTFlypSYNm1aPPHEE7Fq1aq46qqrhnwWf8NHsH///vjqV78aL730UjQ2Nsa5554bDz/8cFx44YVFjwbH5G9/+1tcccUV8fe//z1OOeWU+NSnPhWPPfbYYV8r5d1paWmJ+++/P5YuXRo/+tGP4vTTT4/Vq1cXczJjyW3YsCFefPHFQt5E3yt+/vOfx3XXXRff+MY3Yv/+/dHU1BRf//rXY9myZUM+S121WsBl7wAAjoHruAAAaQgXACAN4QIApCFcAIA0hAsAkIZwAQDSEC4AQBrCBQBIQ7gAAGkIFwAgDeECAKQhXACANP4fUTSwJMxmWEoAAAAASUVORK5CYII=",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"software_series = ddf_filtered_czi.software\n",
"distinct_software_counts = software_series.value_counts()\n",
"\n",
"#This is what the complete data looks like on a log scale\n",
"\n",
"plt.hist(distinct_software_counts, bins=1000)\n",
"plt.yscale('log', nonpositive='clip')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>software</th>\n",
" <th>instance</th>\n",
" <th>data</th>\n",
" <th>mention_counts</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>a</td>\n",
" <td>1</td>\n",
" <td>EINS</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>b</td>\n",
" <td>10</td>\n",
" <td>ZEHN</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>c</td>\n",
" <td>17</td>\n",
" <td>SIEBZEHN</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" software instance data mention_counts\n",
"2 a 1 EINS 8\n",
"0 b 10 ZEHN 5\n",
"1 c 17 SIEBZEHN 4"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Take stratified proportionate sample from original dataframe (n = 100000)\n",
"\n",
"N = 3\n",
"\n",
"_meta = {\"software\": \"object\",\n",
" \"instance\": \"i8\",\n",
" \"data\": \"str\",\n",
" \"mention_counts\": \"i8\"\n",
" }\n",
"\n",
"ddf_sample = ddf_filtered_czi.groupby('software', group_keys=False).apply(\n",
" lambda x: x.sample(\n",
" int(\n",
" np.rint(N * len(x) / ddf_filtered_czi.index.size.compute())\n",
" )\n",
" ),\n",
" meta=_meta\n",
").sample(frac=1).reset_index(drop=True)\n",
"\n",
"ddf_sample.compute().sort_values('software')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAicAAAGdCAYAAADJ6dNTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAUuUlEQVR4nO3df2zUd/3A8VcBW0bkTnGurKxkYTpipymRFVJngswSMheWkJiRzGBDsi3LWKI2cWFOVxfn9o8uJOZ08cdGTNyGM5OYiYjiFoxiCL+Mxm1KmIpDupFID6qO0X6+f5hV+Q6wh23vdeXxSO6P+9zneq++r+Pz3Oeu16aiKIoAAEhiWr0HAAD4T+IEAEhFnAAAqYgTACAVcQIApCJOAIBUxAkAkIo4AQBSmVHvAWo1MjISR44cidmzZ0dTU1O9xwEAxqAoijhx4kS0tbXFtGnnPzfScHFy5MiRaG9vr/cYAMAFOHz4cFxxxRXn3afh4mT27NkR8a9vrlQq1XkaAGAsqtVqtLe3jx7Hz6fh4uSNl3JKpZI4AYAGM5a3ZHhDLACQijgBAFIRJwBAKuIEAEhFnAAAqYgTACAVcQIApCJOAIBU6hInq1evjre//e3x0Y9+tB4PDwAkVpc4+cQnPhHf/va36/HQAEBydYmTD33oQ2P6bH0A4OJTc5zs3LkzVq1aFW1tbdHU1BRbtmx50z6VSiWuvPLKmDlzZixdujR27949HrMCABeBmuNkaGgoOjs7o1KpnPX2zZs3R19fX/T398e+ffuis7MzVq5cGa+88soFDfjaa69FtVo94wIATF01x8kNN9wQDzzwQKxevfqstz/88MNx2223xbp166KjoyMeeeSRmDVrVjz66KMXNOBDDz0U5XJ59NLe3n5BXwcAaAzj+p6TU6dOxd69e6Onp+ffDzBtWvT09MSuXbsu6Gvec889MTg4OHo5fPjweI0LACQ0Yzy/2LFjx2J4eDhaW1vP2N7a2hovvPDC6PWenp749a9/HUNDQ3HFFVfEU089Fd3d3Wf9mi0tLdHS0jKeYwIAiY1rnIzVT3/603o8LADQAMb1ZZ1LL700pk+fHgMDA2dsHxgYiLlz547nQwEAU9S4xklzc3MsXrw4duzYMbptZGQkduzYcc6XbQAA/lPNL+ucPHkyDh48OHr9pZdeigMHDsScOXNi/vz50dfXF729vXHttdfGkiVLYuPGjTE0NBTr1q0b18EBgKmp5jjZs2dPLF++fPR6X19fRET09vbGpk2bYs2aNfHqq6/GfffdF0ePHo1FixbFtm3b3vQmWQCAs2kqiqKo9xC1qFarUS6XY3BwMEqlUr3HAQDGoJbjd13+tg4AwLmIEwAgFXECAKQiTgCAVMQJAJCKOAEAUhEnAEAqDRMnlUolOjo6oqurq96jAAATyIewAQATzoewAQANS5wAAKmIEwAgFXECAKQiTgCAVMQJAJCKOAEAUhEnAEAq4gQASEWcAACpiBMAIBVxAgCkIk4AgFTECQCQijgBAFJpmDipVCrR0dERXV1d9R4FAJhATUVRFPUeohbVajXK5XIMDg5GqVSq9zgAwBjUcvxumDMnAMDFQZwAAKmIEwAgFXECAKQiTgCAVMQJAJCKOAEAUhEnAEAq4gQASEWcAACpiBMAIBVxAgCkIk4AgFTECQCQijgBAFIRJwBAKuIEAEhFnAAAqTRMnFQqlejo6Iiurq56jwIATKCmoiiKeg9Ri2q1GuVyOQYHB6NUKtV7HABgDGo5fjfMmRMA4OIgTgCAVMQJAJCKOAEAUhEnAEAq4gQASEWcAACpiBMAIBVxAgCkIk4AgFTECQCQijgBAFIRJwBAKuIEAEhFnAAAqYgTACAVcQIApCJOAIBUxAkAkIo4AQBSaZg4qVQq0dHREV1dXfUeBQCYQE1FURT1HqIW1Wo1yuVyDA4ORqlUqvc4AMAY1HL8bpgzJwDAxUGcAACpiBMAIBVxAgCkIk4AgFTECQCQijgBAFIRJwBAKuIEAEhFnAAAqYgTACAVcQIApCJOAIBUxAkAkIo4AQBSEScAQCriBABIRZwAAKmIEwAgFXECAKQiTgCAVMQJAJCKOAEAUhEnAEAq4gQASKVh4qRSqURHR0d0dXXVexQAYAI1FUVR1HuIWlSr1SiXyzE4OBilUqne4wAAY1DL8bthzpwAABcHcQIApCJOAIBUxAkAkIo4AQBSEScAQCriBABIRZwAAKmIEwAgFXECAKQiTgCAVMQJAJCKOAEAUhEnAEAq4gQASEWcAACpiBMAIBVxAgCkIk4AgFTECQCQijgBAFIRJwBAKuIEAEhFnAAAqYgTACAVcQIApCJOAIBUxAkAkIo4AQBSEScAQCoNEyeVSiU6Ojqiq6ur3qMAABOoqSiKot5D1KJarUa5XI7BwcEolUr1HgcAGINajt8Nc+YEALg4iBMAIBVxAgCkIk4AgFTECQCQijgBAFIRJwBAKuIEAEhFnAAAqYgTACAVcQIApCJOAIBUxAkAkIo4AQBSEScAQCriBABIRZwAAKmIEwAgFXECAKQiTgCAVMQJAJCKOAEAUhEnAEAq4gQASEWcAACpiBMAIBVxAgCkIk4AgFTECQCQijgBAFIRJwBAKuIEAEhFnAAAqYgTACAVcQIApCJOAIBUxAkAkIo4AQBSEScAQCriBABIpWHipFKpREdHR3R1ddV7FABgAjUVRVHUe4haVKvVKJfLMTg4GKVSqd7jAABjUMvxu2HOnAAAFwdxAgCkIk4AgFTECQCQijgBAFIRJwBAKuIEAEhFnAAAqYgTACAVcQIApCJOAIBUxAkAkIo4AQBSEScAQCriBABIRZwAAKmIEwAgFXECAKQiTgCAVMQJAJCKOAEAUhEnAEAq4gQASEWcAACpiBMAIBVxAgCkIk4AgFTECQCQijgBAFIRJwBAKuIEAEhFnAAAqYgTACAVcQIApCJOAIBUxAkAkIo4AQBSEScAQCriBABIRZwAAKmIEwAgFXECAKQiTgCAVMQJAJCKOAEAUhEnAEAq4gQASEWcAACpiBMAIBVxAgCkIk4AgFTECQCQijgBAFIRJwBAKuIEAEhFnAAAqYgTACCVhomTSqUSHR0d0dXVVe9RAIAJ1FQURVHvIWpRrVajXC7H4OBglEqleo8DAIxBLcfvhjlzAgBcHMQJAJCKOAEAUhEnAEAq4gQASEWcAACpiBMAIBVxAgCkIk4AgFTECQCQijgBAFIRJwBAKuIEAEhFnAAAqYgTACAVcQIApCJOAIBUxAkAkIo4AQBSEScAQCriBABIRZwAAKmIEwAgFXECAKQiTgCAVMQJAJCKOAEAUhEnAEAq4gQASEWcAACpiBMAIBVxAgCkIk4AgFTECQCQijgBAFIRJwBAKuIEAEhFnAAAqYgTACAVcQIApCJOAIBUxAkAkIo4AQBSEScAQCriBABIRZwAAKmIEwAgFXECAKQiTgCAVMQJAJCKOAEAUhEnAEAq4gQASEWcAACpiBMAIBVxAgCkIk4AgFTECQCQijgBAFIRJwBAKuIEAEhFnAAAqYgTACAVcQIApCJOAIBUxAkAkIo4AQBSEScAQCriBABIRZwAAKmIEwAgFXECAKQiTgCAVMQJAJCKOAEAUhEnAEAq4gQASEWcAACpiBMAIBVxAgCkIk4AgFTECQCQijgBAFIRJwBAKuIEAEhFnAAAqYgTACAVcQIApCJOAIBUxAkAkIo4AQBSEScAQCriBABIRZwAAKnUJU6eeeaZWLhwYbz73e+Ob37zm/UYAQBIasZkP+Dp06ejr68vnn322SiXy7F48eJYvXp1vOMd75jsUQCAhCb9zMnu3bvjmmuuiXnz5sVb3/rWuOGGG2L79u2TPQYAkFTNcbJz585YtWpVtLW1RVNTU2zZsuVN+1Qqlbjyyitj5syZsXTp0ti9e/fobUeOHIl58+aNXp83b168/PLLFzY9ADDl1BwnQ0ND0dnZGZVK5ay3b968Ofr6+qK/vz/27dsXnZ2dsXLlynjllVcuaMDXXnstqtXqGRcAYOqqOU5uuOGGeOCBB2L16tVnvf3hhx+O2267LdatWxcdHR3xyCOPxKxZs+LRRx+NiIi2trYzzpS8/PLL0dbWds7He+ihh6JcLo9e2tvbax0ZAGgg4/qek1OnTsXevXujp6fn3w8wbVr09PTErl27IiJiyZIl8dvf/jZefvnlOHnyZPzoRz+KlStXnvNr3nPPPTE4ODh6OXz48HiODAAkM66/rXPs2LEYHh6O1tbWM7a3trbGCy+88K8HnDEjvvzlL8fy5ctjZGQk7r777vP+pk5LS0u0tLSM55gAQGKT/qvEERE33XRT3HTTTfV4aAAguXF9WefSSy+N6dOnx8DAwBnbBwYGYu7cueP5UADAFDWucdLc3ByLFy+OHTt2jG4bGRmJHTt2RHd393g+FAAwRdX8ss7Jkyfj4MGDo9dfeumlOHDgQMyZMyfmz58ffX190dvbG9dee20sWbIkNm7cGENDQ7Fu3bpxHRwAmJpqjpM9e/bE8uXLR6/39fVFRERvb29s2rQp1qxZE6+++mrcd999cfTo0Vi0aFFs27btTW+SBQA4m6aiKIp6D1GLarUa5XI5BgcHo1Qq1XscAGAMajl+1+WvEgMAnIs4AQBSEScAQCriBABIRZwAAKmIEwAgFXECAKRSlz/8dyEqlUpUKpU4ffp0RPzr96UBgMbwxnF7LB+v1nAfwvaXv/wl2tvb6z0GAHABDh8+HFdcccV592m4OBkZGYkjR47E7Nmzo6mpqd7j1F21Wo329vY4fPiwT8ydQNZ5cljnyWGdJ4d1PlNRFHHixIloa2uLadPO/66ShnlZ5w3Tpk37r8V1MSqVSn74J4F1nhzWeXJY58lhnf+tXC6PaT9viAUAUhEnAEAq4qTBtbS0RH9/f7S0tNR7lCnNOk8O6zw5rPPksM4XruHeEAsATG3OnAAAqYgTACAVcQIApCJOAIBUxEkDqFQqceWVV8bMmTNj6dKlsXv37vPuf/z48Vi/fn1cfvnl0dLSEldffXVs3bp1kqZtXLWu88aNG2PhwoVxySWXRHt7e3zqU5+Kf/7zn5M0bWPauXNnrFq1Ktra2qKpqSm2bNnyX+/z3HPPxfvf//5oaWmJd73rXbFp06YJn7PR1brOTz/9dKxYsSLe+c53RqlUiu7u7vjxj388OcM2sAv5eX7DL37xi5gxY0YsWrRowuZrZOIkuc2bN0dfX1/09/fHvn37orOzM1auXBmvvPLKWfc/depUrFixIv74xz/G9773vXjxxRfjG9/4RsybN2+SJ28sta7z448/Hhs2bIj+/v54/vnn41vf+lZs3rw5PvOZz0zy5I1laGgoOjs7o1KpjGn/l156KW688cZYvnx5HDhwID75yU/Grbfe6sD5X9S6zjt37owVK1bE1q1bY+/evbF8+fJYtWpV7N+/f4InbWy1rvMbjh8/Hh//+Mfjwx/+8ARNNgUUpLZkyZJi/fr1o9eHh4eLtra24qGHHjrr/l/72teKBQsWFKdOnZqsEaeEWtd5/fr1xfXXX3/Gtr6+vuK6666b0Dmnkogovv/97593n7vvvru45pprzti2Zs2aYuXKlRM42dQylnU+m46OjuL+++8f/4GmqFrWec2aNcVnP/vZor+/v+js7JzQuRqVMyeJnTp1Kvbu3Rs9PT2j26ZNmxY9PT2xa9eus97nBz/4QXR3d8f69eujtbU13vve98aDDz4Yw8PDkzV2w7mQdf7ABz4Qe/fuHX3p59ChQ7F169b4yEc+MikzXyx27dp1xvMSEbFy5cpzPi+Mj5GRkThx4kTMmTOn3qNMOY899lgcOnQo+vv76z1Kag33h/8uJseOHYvh4eFobW09Y3tra2u88MILZ73PoUOH4mc/+1l87GMfi61bt8bBgwfjzjvvjNdff91/DOdwIet8yy23xLFjx+KDH/xgFEURp0+fjjvuuMPLOuPs6NGjZ31eqtVq/OMf/4hLLrmkTpNNbV/60pfi5MmTcfPNN9d7lCnlD3/4Q2zYsCF+/vOfx4wZDr/n48zJFDMyMhKXXXZZfP3rX4/FixfHmjVr4t57741HHnmk3qNNKc8991w8+OCD8dWvfjX27dsXTz/9dPzwhz+ML3zhC/UeDf4njz/+eNx///3x3e9+Ny677LJ6jzNlDA8Pxy233BL3339/XH311fUeJz3pltill14a06dPj4GBgTO2DwwMxNy5c896n8svvzze8pa3xPTp00e3vec974mjR4/GqVOnorm5eUJnbkQXss6f+9znYu3atXHrrbdGRMT73ve+GBoaittvvz3uvffemDZN94+HuXPnnvV5KZVKzppMgCeffDJuvfXWeOqpp970chr/mxMnTsSePXti//79cdddd0XEv/5nsiiKmDFjRmzfvj2uv/76Ok+Zh39BE2tubo7FixfHjh07RreNjIzEjh07oru7+6z3ue666+LgwYMxMjIyuu33v/99XH755cLkHC5knf/+97+/KUDeCMLCn6saN93d3Wc8LxERP/nJT875vHDhnnjiiVi3bl088cQTceONN9Z7nCmnVCrFb37zmzhw4MDo5Y477oiFCxfGgQMHYunSpfUeMZc6vyGX/+LJJ58sWlpaik2bNhW/+93vittvv71429veVhw9erQoiqJYu3ZtsWHDhtH9//znPxezZ88u7rrrruLFF18snnnmmeKyyy4rHnjggXp9Cw2h1nXu7+8vZs+eXTzxxBPFoUOHiu3btxdXXXVVcfPNN9frW2gIJ06cKPbv31/s37+/iIji4YcfLvbv31/86U9/KoqiKDZs2FCsXbt2dP9Dhw4Vs2bNKj796U8Xzz//fFGpVIrp06cX27Ztq9e30BBqXefvfOc7xYwZM4pKpVL89a9/Hb0cP368Xt9CQ6h1nf8/v61zbuKkAXzlK18p5s+fXzQ3NxdLliwpfvWrX43etmzZsqK3t/eM/X/5y18WS5cuLVpaWooFCxYUX/ziF4vTp09P8tSNp5Z1fv3114vPf/7zxVVXXVXMnDmzaG9vL+68887ib3/72+QP3kCeffbZIiLedHljbXt7e4tly5a96T6LFi0qmpubiwULFhSPPfbYpM/daGpd52XLlp13f87uQn6e/5M4ObemonAOGgDIw3tOAIBUxAkAkIo4AQBSEScAQCriBABIRZwAAKmIEwAgFXECAKQiTgCAVMQJAJCKOAEAUhEnAEAq/wcuMb9jfLy6LAAAAABJRU5ErkJggg==",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"software_series_sample = ddf_sample.software\n",
"distinct_software_sample_counts = software_series_sample.value_counts()\n",
"\n",
"#This is what our data looks like on a log scale\n",
"\n",
"plt.hist(distinct_software_sample_counts, bins=1000)\n",
"plt.yscale('log', nonpositive='clip')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 b\n",
"1 c\n",
"2 a\n",
"Name: software, dtype: object"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Get the series of unique software in the sample\n",
"\n",
"series = ddf_sample.software.unique()\n",
"series.compute()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>software</th>\n",
" <th>instance</th>\n",
" <th>data</th>\n",
" <th>mention_counts</th>\n",
" </tr>\n",
" <tr>\n",
" <th>software</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <th>7</th>\n",
" <td>a</td>\n",
" <td>8</td>\n",
" <td>ACHT</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <th>9</th>\n",
" <td>b</td>\n",
" <td>10</td>\n",
" <td>ZEHN</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <th>14</th>\n",
" <td>c</td>\n",
" <td>15</td>\n",
" <td>FÜNFZEHN</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d</th>\n",
" <th>18</th>\n",
" <td>d</td>\n",
" <td>19</td>\n",
" <td>NEUNZEHN</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" software instance data mention_counts\n",
"software \n",
"a 7 a 8 ACHT 8\n",
"b 9 b 10 ZEHN 5\n",
"c 14 c 15 FÜNFZEHN 4\n",
"d 18 d 19 NEUNZEHN 3"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Take exactly one sample per software from the original dataframe\n",
"\n",
"grouped = ddf_filtered_czi.groupby(['software'])\n",
"grouped_sample = grouped.apply(lambda x: x.sample(1), meta=_meta)\n",
"dfg = grouped_sample.compute()\n",
"dfg"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>software</th>\n",
" <th>instance</th>\n",
" <th>data</th>\n",
" <th>mention_counts</th>\n",
" </tr>\n",
" <tr>\n",
" <th>software</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <th>7</th>\n",
" <td>a</td>\n",
" <td>8</td>\n",
" <td>ACHT</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <th>9</th>\n",
" <td>b</td>\n",
" <td>10</td>\n",
" <td>ZEHN</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <th>14</th>\n",
" <td>c</td>\n",
" <td>15</td>\n",
" <td>FÜNFZEHN</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" software instance data mention_counts\n",
"software \n",
"a 7 a 8 ACHT 8\n",
"b 9 b 10 ZEHN 5\n",
"c 14 c 15 FÜNFZEHN 4"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create a new dataframe that contains the one sample from the original dataframe for the software that is in the series\n",
"\n",
"df = dfg[dfg['software'].isin(series)]\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0rc1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment