epifanio/species clean 0 custom names.ipynb

## species clean 0 custom names.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Custom names Part 1: Undecided identifications\n",
    "These are limited sets of species-level taxa that are hard to tell apart on footage. They can be standardized because they are recurrent (e.g. \"Geodia/Stelleta\", \"Lycodonus/Lycenchelis\"). "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fuzzyutil import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('names_v0.csv', encoding='latin1')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Taxonomy</th>\n",
       "      <th>freq</th>\n",
       "      <th>last_seen</th>\n",
       "      <th>To_name</th>\n",
       "      <th>Status</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Epizooanthidae</td>\n",
       "      <td>1</td>\n",
       "      <td>107</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Lebensspuren (echinoderm bulldozing tracks)</td>\n",
       "      <td>1</td>\n",
       "      <td>112</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Paguridae</td>\n",
       "      <td>1</td>\n",
       "      <td>112</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Porifera, apricot incrusting</td>\n",
       "      <td>1</td>\n",
       "      <td>161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>Porifera, many osculi</td>\n",
       "      <td>1</td>\n",
       "      <td>161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0                                      Taxonomy  freq  last_seen  \\\n",
       "0           1                                Epizooanthidae     1        107   \n",
       "1           2   Lebensspuren (echinoderm bulldozing tracks)     1        112   \n",
       "2           3                                     Paguridae     1        112   \n",
       "3           4                  Porifera, apricot incrusting     1        161   \n",
       "4           5                         Porifera, many osculi     1        161   \n",
       "\n",
       "   To_name  Status  \n",
       "0      NaN   False  \n",
       "1      NaN   False  \n",
       "2      NaN   False  \n",
       "3      NaN   False  \n",
       "4      NaN   False  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8289"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Subset of candidate names\n",
    "Names containing \"/\".\n",
    "\n",
    "Repeat from here, after changing the score limit and/or method below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "undecided = df['Taxonomy'][((df['Taxonomy'].str.contains('/'))) & (df['Status']==False)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "906"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(undecided)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Remove the unsure ids from the list of candidate names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "unsure = df['Taxonomy'][(df['Taxonomy'].str.contains('cf')) & (df['Status']==False)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "undecided = [x for x in undecided if x not in unsure.values]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "884"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(undecided)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## List of available choices\n",
    "Groups (max 3) of undistinguishable species"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "Sppgroups=['Lycodonus/Lycenchelys',\n",
    "'Lycodonus/Lycenchelys/Lumpenus',\n",
    "'Geodia/Stelleta',\n",
    "'Phakellia/Axinella',\n",
    "'Geodia/Stryphnus',\n",
    "'Porania/Poraniomorpha',\n",
    "'Ceramaster/Hippasterias',\n",
    "'Bythocaris/Boreomysis']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Compare to provided list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['Locodonus/Lycenchelys', 'Lycodonus/Lycenchelys', 95]]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "map = matchinglist(undecided, Sppgroups,scorelimit=95, method='token_sort', perfectmatch=True)\n",
    "map"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* Replace and set status as OK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  \n",
      "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "for i in map:\n",
    "    df['To_name'][df['Taxonomy']==i[0]] = i[1]\n",
    "    df['Status'][df['Taxonomy']==i[0]] = True"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* Repeat until no more matches are found!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('names_v1.csv', index=False, encoding='latin1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Custom names Part 1: Undecided identifications\n",
	"These are limited sets of species-level taxa that are hard to tell apart on footage. They can be standardized because they are recurrent (e.g. \"Geodia/Stelleta\", \"Lycodonus/Lycenchelis\"). "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"from fuzzyutil import *"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"df = pd.read_csv('names_v0.csv', encoding='latin1')\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>Unnamed: 0</th>\n",
	" <th>Taxonomy</th>\n",
	" <th>freq</th>\n",
	" <th>last_seen</th>\n",
	" <th>To_name</th>\n",
	" <th>Status</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>1</td>\n",
	" <td>Epizooanthidae</td>\n",
	" <td>1</td>\n",
	" <td>107</td>\n",
	" <td>NaN</td>\n",
	" <td>False</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>2</td>\n",
	" <td>Lebensspuren (echinoderm bulldozing tracks)</td>\n",
	" <td>1</td>\n",
	" <td>112</td>\n",
	" <td>NaN</td>\n",
	" <td>False</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>3</td>\n",
	" <td>Paguridae</td>\n",
	" <td>1</td>\n",
	" <td>112</td>\n",
	" <td>NaN</td>\n",
	" <td>False</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>4</td>\n",
	" <td>Porifera, apricot incrusting</td>\n",
	" <td>1</td>\n",
	" <td>161</td>\n",
	" <td>NaN</td>\n",
	" <td>False</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>5</td>\n",
	" <td>Porifera, many osculi</td>\n",
	" <td>1</td>\n",
	" <td>161</td>\n",
	" <td>NaN</td>\n",
	" <td>False</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" Unnamed: 0 Taxonomy freq last_seen \\\n",
	"0 1 Epizooanthidae 1 107 \n",
	"1 2 Lebensspuren (echinoderm bulldozing tracks) 1 112 \n",
	"2 3 Paguridae 1 112 \n",
	"3 4 Porifera, apricot incrusting 1 161 \n",
	"4 5 Porifera, many osculi 1 161 \n",
	"\n",
	" To_name Status \n",
	"0 NaN False \n",
	"1 NaN False \n",
	"2 NaN False \n",
	"3 NaN False \n",
	"4 NaN False "
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df.head(5)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"8289"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(df)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Subset of candidate names\n",
	"Names containing \"/\".\n",
	"\n",
	"Repeat from here, after changing the score limit and/or method below."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {},
	"outputs": [],
	"source": [
	"undecided = df['Taxonomy'][((df['Taxonomy'].str.contains('/'))) & (df['Status']==False)]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"906"
	]
	},
	"execution_count": 20,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(undecided)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Remove the unsure ids from the list of candidate names"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {},
	"outputs": [],
	"source": [
	"unsure = df['Taxonomy'][(df['Taxonomy'].str.contains('cf')) & (df['Status']==False)]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {},
	"outputs": [],
	"source": [
	"undecided = [x for x in undecided if x not in unsure.values]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"884"
	]
	},
	"execution_count": 23,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(undecided)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## List of available choices\n",
	"Groups (max 3) of undistinguishable species"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [],
	"source": [
	"Sppgroups=['Lycodonus/Lycenchelys',\n",
	"'Lycodonus/Lycenchelys/Lumpenus',\n",
	"'Geodia/Stelleta',\n",
	"'Phakellia/Axinella',\n",
	"'Geodia/Stryphnus',\n",
	"'Porania/Poraniomorpha',\n",
	"'Ceramaster/Hippasterias',\n",
	"'Bythocaris/Boreomysis']"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Compare to provided list"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[['Locodonus/Lycenchelys', 'Lycodonus/Lycenchelys', 95]]"
	]
	},
	"execution_count": 27,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"map = matchinglist(undecided, Sppgroups,scorelimit=95, method='token_sort', perfectmatch=True)\n",
	"map"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"* Replace and set status as OK"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
	"A value is trying to be set on a copy of a slice from a DataFrame\n",
	"\n",
	"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
	" \n",
	"/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: \n",
	"A value is trying to be set on a copy of a slice from a DataFrame\n",
	"\n",
	"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
	" This is separate from the ipykernel package so we can avoid doing imports until\n"
	]
	}
	],
	"source": [
	"for i in map:\n",
	" df['To_name'][df['Taxonomy']==i[0]] = i[1]\n",
	" df['Status'][df['Taxonomy']==i[0]] = True"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"* Repeat until no more matches are found!"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {},
	"outputs": [],
	"source": [
	"df.to_csv('names_v1.csv', index=False, encoding='latin1')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}