epifanio/Join_dataframes.ipynb

## Join_dataframes.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Join_dataframes.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## names_check.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              names_check.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## species clean 0 custom names.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Custom names Part 1: Undecided identifications\n",
    "These are limited sets of species-level taxa that are hard to tell apart on footage. Because the same sets recur, they can be standardized (e.g. \"Geodia/Stelleta\", \"Lycodonus/Lycenchelys\")."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fuzzyutil import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('names_v0.csv', encoding='latin1')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Taxonomy</th>\n",
       "      <th>freq</th>\n",
       "      <th>last_seen</th>\n",
       "      <th>To_name</th>\n",
       "      <th>Status</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Epizooanthidae</td>\n",
       "      <td>1</td>\n",
       "      <td>107</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Lebensspuren (echinoderm bulldozing tracks)</td>\n",
       "      <td>1</td>\n",
       "      <td>112</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Paguridae</td>\n",
       "      <td>1</td>\n",
       "      <td>112</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Porifera, apricot incrusting</td>\n",
       "      <td>1</td>\n",
       "      <td>161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>Porifera, many osculi</td>\n",
       "      <td>1</td>\n",
       "      <td>161</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0                                      Taxonomy  freq  last_seen  \\\n",
       "0           1                                Epizooanthidae     1        107   \n",
       "1           2   Lebensspuren (echinoderm bulldozing tracks)     1        112   \n",
       "2           3                                     Paguridae     1        112   \n",
       "3           4                  Porifera, apricot incrusting     1        161   \n",
       "4           5                         Porifera, many osculi     1        161   \n",
       "\n",
       "   To_name  Status  \n",
       "0      NaN   False  \n",
       "1      NaN   False  \n",
       "2      NaN   False  \n",
       "3      NaN   False  \n",
       "4      NaN   False  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8289"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Subset of candidate names\n",
    "Names containing \"/\".\n",
    "\n",
    "Repeat from here, after changing the score limit and/or method below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Candidate names: Taxonomy contains a slash and Status is still False.\n",
    "has_slash = df['Taxonomy'].str.contains('/')\n",
    "not_done = df['Status'] == False\n",
    "undecided = df.loc[has_slash & not_done, 'Taxonomy']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "906"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(undecided)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Remove the uncertain identifications (names containing \"cf\") from the list of candidate names."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Names containing the substring 'cf' whose Status is still False.\n",
    "contains_cf = df['Taxonomy'].str.contains('cf')\n",
    "unsure = df.loc[contains_cf & (df['Status'] == False), 'Taxonomy']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the lookup once: set membership avoids rescanning the whole `unsure` array for every candidate.\n",
    "unsure_names = set(unsure)\n",
    "undecided = [name for name in undecided if name not in unsure_names]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "884"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(undecided)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## List of available choices\n",
    "Groups of up to 3 indistinguishable species."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Accepted labels for groups of taxa that cannot be told apart on footage.\n",
    "# NOTE(review): 'Stelleta' and 'Hippasterias' may be misspellings of the genera Stelletta and Hippasteria - confirm before changing these strings.\n",
    "Sppgroups = [\n",
    "    'Lycodonus/Lycenchelys',\n",
    "    'Lycodonus/Lycenchelys/Lumpenus',\n",
    "    'Geodia/Stelleta',\n",
    "    'Phakellia/Axinella',\n",
    "    'Geodia/Stryphnus',\n",
    "    'Porania/Poraniomorpha',\n",
    "    'Ceramaster/Hippasterias',\n",
    "    'Bythocaris/Boreomysis',\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Compare the candidate names to the list of available choices."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['Locodonus/Lycenchelys', 'Lycodonus/Lycenchelys', 95]]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): `map` shadows the Python builtin; the name is kept because the replacement cell iterates over it.\n",
    "map = matchinglist(\n",
    "    undecided,\n",
    "    Sppgroups,\n",
    "    scorelimit=95,\n",
    "    method='token_sort',\n",
    "    perfectmatch=True,\n",
    ")\n",
    "map"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* Replace and set status as OK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each entry of `map` is indexed as [name found in df, replacement name, ...].\n",
    "for match in map:\n",
    "    is_match = df['Taxonomy'] == match[0]\n",
    "    # One .loc call per column writes into df itself; the chained form\n",
    "    # df[col][mask] = value can assign to a temporary copy (SettingWithCopyWarning).\n",
    "    df.loc[is_match, 'To_name'] = match[1]\n",
    "    df.loc[is_match, 'Status'] = True"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* Repeat until no more matches are found!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('names_v1.csv', index=False, encoding='latin1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

## species clean 2 morphospecies.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              species clean 2 morphospecies.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## species clean 3 provisional names.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              species clean 3 provisional names.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## species clean 4 lazy.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              species clean 4 lazy.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Species clean order by similarity.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Species clean order by similarity.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Species Clean Summary.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Species Clean Summary.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## species clean 5 worms-DEV-V2.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              species clean 5 worms-DEV-V2.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Custom names Part 1: Undecided identifications\n",
	"These are limited sets of species-level taxa that are hard to tell apart on footage. Because the same sets recur, they can be standardized (e.g. \"Geodia/Stelleta\", \"Lycodonus/Lycenchelys\")."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"from fuzzyutil import *"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"df = pd.read_csv('names_v0.csv', encoding='latin1')\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>Unnamed: 0</th>\n",
	" <th>Taxonomy</th>\n",
	" <th>freq</th>\n",
	" <th>last_seen</th>\n",
	" <th>To_name</th>\n",
	" <th>Status</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>1</td>\n",
	" <td>Epizooanthidae</td>\n",
	" <td>1</td>\n",
	" <td>107</td>\n",
	" <td>NaN</td>\n",
	" <td>False</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>2</td>\n",
	" <td>Lebensspuren (echinoderm bulldozing tracks)</td>\n",
	" <td>1</td>\n",
	" <td>112</td>\n",
	" <td>NaN</td>\n",
	" <td>False</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>3</td>\n",
	" <td>Paguridae</td>\n",
	" <td>1</td>\n",
	" <td>112</td>\n",
	" <td>NaN</td>\n",
	" <td>False</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>4</td>\n",
	" <td>Porifera, apricot incrusting</td>\n",
	" <td>1</td>\n",
	" <td>161</td>\n",
	" <td>NaN</td>\n",
	" <td>False</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>5</td>\n",
	" <td>Porifera, many osculi</td>\n",
	" <td>1</td>\n",
	" <td>161</td>\n",
	" <td>NaN</td>\n",
	" <td>False</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" Unnamed: 0 Taxonomy freq last_seen \\\n",
	"0 1 Epizooanthidae 1 107 \n",
	"1 2 Lebensspuren (echinoderm bulldozing tracks) 1 112 \n",
	"2 3 Paguridae 1 112 \n",
	"3 4 Porifera, apricot incrusting 1 161 \n",
	"4 5 Porifera, many osculi 1 161 \n",
	"\n",
	" To_name Status \n",
	"0 NaN False \n",
	"1 NaN False \n",
	"2 NaN False \n",
	"3 NaN False \n",
	"4 NaN False "
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df.head(5)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"8289"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(df)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Subset of candidate names\n",
	"Names containing \"/\".\n",
	"\n",
	"Repeat from here, after changing the score limit and/or method below."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Candidate names: Taxonomy contains a slash and Status is still False.\n",
	"has_slash = df['Taxonomy'].str.contains('/')\n",
	"not_done = df['Status'] == False\n",
	"undecided = df.loc[has_slash & not_done, 'Taxonomy']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"906"
	]
	},
	"execution_count": 20,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(undecided)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Remove the uncertain identifications (names containing \"cf\") from the list of candidate names."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Names containing the substring 'cf' whose Status is still False.\n",
	"contains_cf = df['Taxonomy'].str.contains('cf')\n",
	"unsure = df.loc[contains_cf & (df['Status'] == False), 'Taxonomy']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Build the lookup once: set membership avoids rescanning the whole `unsure` array for every candidate.\n",
	"unsure_names = set(unsure)\n",
	"undecided = [name for name in undecided if name not in unsure_names]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"884"
	]
	},
	"execution_count": 23,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(undecided)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## List of available choices\n",
	"Groups of up to 3 indistinguishable species."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Accepted labels for groups of taxa that cannot be told apart on footage.\n",
	"# NOTE(review): 'Stelleta' and 'Hippasterias' may be misspellings of the genera Stelletta and Hippasteria - confirm before changing these strings.\n",
	"Sppgroups = [\n",
	"    'Lycodonus/Lycenchelys',\n",
	"    'Lycodonus/Lycenchelys/Lumpenus',\n",
	"    'Geodia/Stelleta',\n",
	"    'Phakellia/Axinella',\n",
	"    'Geodia/Stryphnus',\n",
	"    'Porania/Poraniomorpha',\n",
	"    'Ceramaster/Hippasterias',\n",
	"    'Bythocaris/Boreomysis',\n",
	"]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Compare the candidate names to the list of available choices."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[['Locodonus/Lycenchelys', 'Lycodonus/Lycenchelys', 95]]"
	]
	},
	"execution_count": 27,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# NOTE(review): `map` shadows the Python builtin; the name is kept because the replacement cell iterates over it.\n",
	"map = matchinglist(\n",
	"    undecided,\n",
	"    Sppgroups,\n",
	"    scorelimit=95,\n",
	"    method='token_sort',\n",
	"    perfectmatch=True,\n",
	")\n",
	"map"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"* Replace and set status as OK"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Each entry of `map` is indexed as [name found in df, replacement name, ...].\n",
	"for match in map:\n",
	"    is_match = df['Taxonomy'] == match[0]\n",
	"    # One .loc call per column writes into df itself; the chained form\n",
	"    # df[col][mask] = value can assign to a temporary copy (SettingWithCopyWarning).\n",
	"    df.loc[is_match, 'To_name'] = match[1]\n",
	"    df.loc[is_match, 'Status'] = True"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"* Repeat until no more matches are found!"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {},
	"outputs": [],
	"source": [
	"df.to_csv('names_v1.csv', index=False, encoding='latin1')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}