Forked from stpnk/pandas_get_rid_of_loops_time.ipynb
Last active
June 1, 2019 21:41
-
-
Save VasilijKolomiets/4a773f10ae0bb0e6f561b7abf0fa5f2e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 110, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 111, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_suspect = pd.DataFrame({\"quantity\":[0,0,0]}, index = pd.Series(['X01', 'X03', 'X04'], name = 'id') )\n", | |
"df_work = pd.DataFrame({\"tr_id\":[1,1,1,1,2,2,3,3,3],\n", | |
" \"id\":[\"X03\",\"X02\",\"X03\",\"X03\",'X01','X01','X04','X04','X04'], \n", | |
" \"reason\":['P','P','Q','I',\"Q\",\"I\",\"I\",\"P\",\"Q\"], \n", | |
" \"status\":[\"BROCKEN\"]+5*[\"GOOD\"] +[\"TO_REPAIR\"] + [\"BROCKEN\"]+[\"GOOD\"] ,\n", | |
" \"case\":9*[\"\"], \n", | |
" \"quantity\":[-1,-1,2,2,-1,-1,1,2,-3,]})\n", | |
"\n", | |
"\n", | |
"rooles = {(\"Q\",\"GOOD\"):[(\"P\",\"BROCKEN\"),\n", | |
" (\"I\",\"TO_REPAIR\"),\n", | |
" ],\n", | |
" (\"P\",\"REPAIRED\"):[(\"Q\",\"LOST\"),\n", | |
" (\"I\",\"TO_REPAIR\"),\n", | |
" ],\n", | |
" }\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_work" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 112, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"144\n" | |
] | |
} | |
], | |
"source": [ | |
"n=4\n", | |
"for i in range(n):\n", | |
" df_work = df_work.append(df_work, ignore_index=True)\n", | |
"print(len(df_work))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_work.head(30)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 113, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"792 ms ± 33.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"tr_ids = set(df_work[\"tr_id\"]) #вибираємо номера документів з декількома транзакціями\n", | |
"for tr_id in tr_ids:\n", | |
" df_transaction = df_work.query(\"tr_id==@tr_id\") # group_by?\n", | |
"# df_transaction = df_work[df_work[\"tr_id\"] == tr_id]\n", | |
" for row in df_transaction.itertuples():\n", | |
" for roole, r_list in rooles.items():\n", | |
" if (row.reason, row.status) == roole:\n", | |
" df_transaction_one_id = df_transaction.query(\"id == @row.id\") # group_by?\n", | |
" sum_suspect =0\n", | |
" for r_roole in r_list: \n", | |
" for r_row in df_transaction_one_id.itertuples():\n", | |
" if (r_row.reason, r_row.status) == r_roole:\n", | |
" sum_suspect += r_row.quantity # group_by?\n", | |
" df_work.loc[r_row.Index, \"case\"] = str(roole)+\":\"+str(r_roole)\n", | |
" if sum_suspect:\n", | |
" df_suspect.loc[row.id,\"quantity\"] += sum_suspect\n", | |
"df_rez = df_work.query(\"case != ''\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_rez" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 114, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_suspect = pd.DataFrame({\"quantity\":[0,0,0]}, index = pd.Series(['X01', 'X03', 'X04'], name = 'id') )\n", | |
"df_work = pd.DataFrame({\"tr_id\":[1,1,1,1,2,2,3,3,3],\n", | |
" \"id\":[\"X03\",\"X02\",\"X03\",\"X03\",'X01','X01','X04','X04','X04'], \n", | |
" \"reason\":['P','P','Q','I',\"Q\",\"I\",\"I\",\"P\",\"Q\"], \n", | |
" \"status\":[\"BROKEN\"]+5*[\"GOOD\"] +[\"TO_REPAIR\"] + [\"BROKEN\"]+[\"GOOD\"] ,\n", | |
" \"case\":9*[\"\"], \n", | |
" \"quantity\":[-1,-1,2,2,-1,-1,1,2,-3,]})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 115, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"144\n" | |
] | |
} | |
], | |
"source": [ | |
"n=4\n", | |
"for i in range(n):\n", | |
" df_work = df_work.append(df_work, ignore_index=True)\n", | |
"print(len(df_work))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_work" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 116, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"39.8 ms ± 4.14 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"def generate_case(g):\n", | |
" \"\"\"Функция для заполнения поля case\n", | |
" g - группа записей в виде датафрейма\n", | |
" \"\"\"\n", | |
"\n", | |
" # Я немного изменил словарь с правилами.\n", | |
" # Посчитал, что будет проще работать со\n", | |
" # строками в качестве ключей и значений, \n", | |
" # чем с кортежами.\n", | |
" rules = {\n", | |
" \"Q-GOOD\":[\n", | |
" \"P-BROKEN\",\n", | |
" \"I-TO_REPAIR\",\n", | |
" ],\n", | |
" \"P-REPAIRED\":[\n", | |
" \"Q-LOST\",\n", | |
" \"I-TO_REPAIR\",\n", | |
" ],\n", | |
" }\n", | |
"\n", | |
" # Предварительно заполняем столбец case\n", | |
" # значениями \"<reason>-<status>\".\n", | |
" g.case = g.reason.str.cat(g.status, sep='-')\n", | |
"\n", | |
" # Проверяем есть ли значения из case\n", | |
" # среди ключей в словаре правил.\n", | |
" # Если найдено хотя бы одно совпадение (.any()),\n", | |
" # то продолжаем работать с этой группой.\n", | |
" if g.case.isin(rules).any():\n", | |
"\n", | |
" # Мы знаем, что в группе записей для одного товара\n", | |
" # в пределах одного документа может быть только\n", | |
" # одна запись с хорошим состоянием.\n", | |
" # Получим значение.\n", | |
" good_condition = g[g.case.isin(rules)].case.values[0]\n", | |
" #good_condition = g.query('case in @rules').case.values[0]\n", | |
"\n", | |
" # Достаем из словаря правил список подозрительных\n", | |
" # значений для данного хорошего состояния.\n", | |
" suspicious_conditions = rules[good_condition]\n", | |
"\n", | |
" # Перезаполняем case.\n", | |
" # Если значение ячейки case есть в списке\n", | |
" # подозрительных проводок для данного ключа, то\n", | |
" # заполняем case - \"<good_condition>:<suspicious_condition>\",\n", | |
" # иначе записываем nan.\n", | |
" g.case = np.where(g.case.isin(suspicious_conditions),\n", | |
" good_condition + ':' + g.case,\n", | |
" np.nan)\n", | |
"\n", | |
" else:\n", | |
" g.case = np.nan\n", | |
"\n", | |
" return g\n", | |
"\n", | |
"df_rez = (df_work.groupby(['tr_id', 'id']) # Группируем по id документа и id товара. \n", | |
" \n", | |
" # Применяем функцию для заполнения case\n", | |
" # Будут заполнены только ячейки, удовлетворяющие правилам.\n", | |
" .apply(generate_case)\n", | |
" \n", | |
" # Выбрасываем записи с пустым полем case\n", | |
" .dropna(subset=['case']))\n", | |
"\n", | |
"# Берем из df_work столбцы 'id' и 'quantity'.\n", | |
"# Группируем по id товара и суммируем quantity\n", | |
"df_quantity_sum = (df_rez.loc[:, ['id', 'quantity']]\n", | |
" .set_index('id')\n", | |
" .groupby('id')\n", | |
" .sum())\n", | |
"\n", | |
"# Суммируем получившийся датафрейм с df_suspect,\n", | |
"# который создали ранее. \n", | |
"# Пустые значения меняем на 0.\n", | |
"# Конвертируем в int.\n", | |
"df_suspect_rez = (df_suspect + df_quantity_sum).fillna(0).astype(int)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_rez" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"help(df_work.append)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_suspect_rez" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment