Skip to content

Instantly share code, notes, and snippets.

@epifanio
Created November 14, 2019 12:50
Show Gist options
  • Save epifanio/50ae3faa106940b941fac42c45812f3c to your computer and use it in GitHub Desktop.
Save epifanio/50ae3faa106940b941fac42c45812f3c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from fuzzyutil import *"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('names_v4.csv', encoding='latin1')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Taxonomy</th>\n",
" <th>freq</th>\n",
" <th>last_seen</th>\n",
" <th>To_name</th>\n",
" <th>Status</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Epizooanthidae</td>\n",
" <td>1</td>\n",
" <td>107</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Lebensspuren (echinoderm bulldozing tracks)</td>\n",
" <td>1</td>\n",
" <td>112</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Paguridae</td>\n",
" <td>1</td>\n",
" <td>112</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Porifera, apricot incrusting</td>\n",
" <td>1</td>\n",
" <td>161</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Porifera, many osculi</td>\n",
" <td>1</td>\n",
" <td>161</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Taxonomy freq last_seen \\\n",
"0 1 Epizooanthidae 1 107 \n",
"1 2 Lebensspuren (echinoderm bulldozing tracks) 1 112 \n",
"2 3 Paguridae 1 112 \n",
"3 4 Porifera, apricot incrusting 1 161 \n",
"4 5 Porifera, many osculi 1 161 \n",
"\n",
" To_name Status \n",
"0 NaN False \n",
"1 NaN False \n",
"2 NaN False \n",
"3 NaN False \n",
"4 NaN False "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8289"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(df)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"allspecies = df['Taxonomy'][df['Status']==False]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8073"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(allspecies)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Reorder list of elements according to their relative distance"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"results = similorder(allspecies, scorelimit=99, method='token_set', verbose=False)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"100"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(results)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"toadj = getcorrections(results)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "82cd5836db6d42ee9f996c3c9128b6d4",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"toadj"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"7448"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(results)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"corrected = toadj.get_changed_df()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"corrected"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<function fuzzyutil.getcorrections(toadjust, to_correct='to_correct', corrected='corrected')>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"getcorrections"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'list' object has no attribute 'to_csv'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-18-c18cead26d21>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mresults\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'names_v4_false_ordered.csv'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m: 'list' object has no attribute 'to_csv'"
]
}
],
"source": [
"results.to_csv('names_v4_false_ordered.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"myresults = pd.DataFrame(results)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"myresults.to_csv('myresults.csv', index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Epizooanthidae\n",
"epizooanthidae on Paguridae\n",
"Paguridae with epizooanthidae\n",
"Paguridae like\n",
" Paguridae\n",
"Paguridae\n",
"Pandalidae\n",
"Pandalidae red\n",
"pandalidae red\n",
"Crangonidae/Pandalidae\n"
]
}
],
"source": [
"!head myresults.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment