epifanio/species clean 4 lazy.ipynb

## species clean 4 lazy.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Custom names Part 4: \"Lazy\" names\n",
    "Taxa known solely by the genus name even though the species is known (it's just omitted), e.g. \"Mycale\" (we know it's \"Mycale lingua\")."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fuzzyutil import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('names_v3.csv', encoding='latin1')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## List of candidates\n",
    "Names that contain only one term **after tidying**."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Taxonomy values of the rows whose Status is still False.\n",
    "unique_allspecies = df.loc[df['Status'] == False, 'Taxonomy']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8073"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(unique_allspecies)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the names returned by tidy() that are a single whitespace-delimited token.\n",
    "unique_oneterm = [name for name in tidy(unique_allspecies) if len(name.split()) == 1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Remove those with a \"/\" (they have one apparent term, but they should not be here)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-term names that contain a slash; they are dropped from the candidates in the next cell.\n",
    "undecided = [name for name in unique_oneterm if \"/\" in name]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop every name that contains a slash. Testing the name directly selects the same\n",
    "# names as filtering against the `undecided` list, without scanning that list per name.\n",
    "unique_oneterm = [name for name in unique_oneterm if \"/\" not in name]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "#unique_oneterm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* Remove those with a question mark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-term names that contain a question mark; they are dropped from the candidates in the next cell.\n",
    "unsure = [name for name in unique_oneterm if \"?\" in name]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop every name that contains a question mark. Testing the name directly selects the same\n",
    "# names as filtering against the `unsure` list, without scanning that list per name.\n",
    "unique_oneterm = [name for name in unique_oneterm if \"?\" not in name]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Replace using lookup table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "lut = pd.read_csv(\"species list lazy.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['Acesta', 'Acesta', 100],\n",
       " ['Acesta', 'Acesta', 100],\n",
       " ['Aplysilla', 'Aplysilla', 100],\n",
       " ['Craniella', 'Craniella', 100],\n",
       " ['Ditrupa', 'Ditrupa', 100],\n",
       " ['itrupa', 'Ditrupa', 92],\n",
       " ['Flabellum', 'Flabellum', 100],\n",
       " ['Hymenaster', 'Hymenaster', 100],\n",
       " ['Madrepore', 'Madrepora', 89],\n",
       " ['Stichastrella', 'Stichastrella', 100],\n",
       " ['Virgularia', 'Virgularia', 100]]"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Match the remaining one-term names against the lazy names of the lookup table.\n",
    "# Each entry of the result is [name, matched lazy name, score], as the output below shows.\n",
    "oneterm_matches = matchinglist(\n",
    "    unique_oneterm,\n",
    "    lut['lazy name'].tolist(),\n",
    "    scorelimit=87,\n",
    "    method='ratio',\n",
    "    perfectmatch=True,\n",
    ")\n",
    "oneterm_matches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the corrected name for every match and flag those rows with Status = True.\n",
    "# Assignments go through df.loc[mask, column]: the chained form df[column][mask] = ...\n",
    "# may write to a temporary copy and raises SettingWithCopyWarning.\n",
    "for match in oneterm_matches:\n",
    "    candidate, lazy_name = match[0], match[1]\n",
    "    correct = lut.loc[lut['lazy name'] == lazy_name, 'correct name'].values\n",
    "    rows = df['Taxonomy'] == candidate\n",
    "    # Assign a scalar so it broadcasts to every row selected by the mask\n",
    "    # (the first lookup row wins if a lazy name is listed more than once).\n",
    "    df.loc[rows, 'To_name'] = correct[0]\n",
    "    df.loc[rows, 'Status'] = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('names_v4.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Custom names Part 4: \"Lazy\" names\n",
	"Taxa known solely by the genus name even though the species is known (it's just omitted), e.g. \"Mycale\" (we know it's \"Mycale lingua\")."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"from fuzzyutil import *"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"df = pd.read_csv('names_v3.csv', encoding='latin1')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## List of candidates\n",
	"Names that contain only one term **after tidying**."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Taxonomy values of the rows whose Status is still False.\n",
	"unique_allspecies = df.loc[df['Status'] == False, 'Taxonomy']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"8073"
	]
	},
	"execution_count": 27,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(unique_allspecies)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Keep only the names returned by tidy() that are a single whitespace-delimited token.\n",
	"unique_oneterm = [name for name in tidy(unique_allspecies) if len(name.split()) == 1]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Remove those with a \"/\" (they have one apparent term, but they should not be here)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {},
	"outputs": [],
	"source": [
	"# One-term names that contain a slash; they are dropped from the candidates in the next cell.\n",
	"undecided = [name for name in unique_oneterm if \"/\" in name]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Drop every name that contains a slash. Testing the name directly selects the same\n",
	"# names as filtering against the `undecided` list, without scanning that list per name.\n",
	"unique_oneterm = [name for name in unique_oneterm if \"/\" not in name]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 31,
	"metadata": {},
	"outputs": [],
	"source": [
	"#unique_oneterm"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"* Remove those with a question mark"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 32,
	"metadata": {},
	"outputs": [],
	"source": [
	"# One-term names that contain a question mark; they are dropped from the candidates in the next cell.\n",
	"unsure = [name for name in unique_oneterm if \"?\" in name]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 33,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Drop every name that contains a question mark. Testing the name directly selects the same\n",
	"# names as filtering against the `unsure` list, without scanning that list per name.\n",
	"unique_oneterm = [name for name in unique_oneterm if \"?\" not in name]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Replace using lookup table"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 34,
	"metadata": {},
	"outputs": [],
	"source": [
	"lut = pd.read_csv(\"species list lazy.csv\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 43,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[['Acesta', 'Acesta', 100],\n",
	" ['Acesta', 'Acesta', 100],\n",
	" ['Aplysilla', 'Aplysilla', 100],\n",
	" ['Craniella', 'Craniella', 100],\n",
	" ['Ditrupa', 'Ditrupa', 100],\n",
	" ['itrupa', 'Ditrupa', 92],\n",
	" ['Flabellum', 'Flabellum', 100],\n",
	" ['Hymenaster', 'Hymenaster', 100],\n",
	" ['Madrepore', 'Madrepora', 89],\n",
	" ['Stichastrella', 'Stichastrella', 100],\n",
	" ['Virgularia', 'Virgularia', 100]]"
	]
	},
	"execution_count": 43,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Match the remaining one-term names against the lazy names of the lookup table.\n",
	"# Each entry of the result is [name, matched lazy name, score], as the output below shows.\n",
	"oneterm_matches = matchinglist(\n",
	"    unique_oneterm,\n",
	"    lut['lazy name'].tolist(),\n",
	"    scorelimit=87,\n",
	"    method='ratio',\n",
	"    perfectmatch=True,\n",
	")\n",
	"oneterm_matches"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 44,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Write the corrected name for every match and flag those rows with Status = True.\n",
	"# Assignments go through df.loc[mask, column]: the chained form df[column][mask] = ...\n",
	"# may write to a temporary copy and raises SettingWithCopyWarning.\n",
	"for match in oneterm_matches:\n",
	"    candidate, lazy_name = match[0], match[1]\n",
	"    correct = lut.loc[lut['lazy name'] == lazy_name, 'correct name'].values\n",
	"    rows = df['Taxonomy'] == candidate\n",
	"    # Assign a scalar so it broadcasts to every row selected by the mask\n",
	"    # (the first lookup row wins if a lazy name is listed more than once).\n",
	"    df.loc[rows, 'To_name'] = correct[0]\n",
	"    df.loc[rows, 'Status'] = True"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 45,
	"metadata": {},
	"outputs": [],
	"source": [
	"df.to_csv('names_v4.csv', index=False)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}