codebrain001/Reformat the table structure.ipynb

## Reformat the table structure.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = dask_df[\"county\"].unique().compute()\n",
    "county = dict((i,dict()) for i in list(c))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Creating a new dataframe would have been done by `dd.DataFrame()` but dask advices us not use this class directly.  Instead use functions like\n",
    "``dd.read_csv``, ``dd.read_parquet``, or ``dd.from_pandas``.\n",
    "So, we will work with pandas to create a new dataframe then convert it to a Dask dataframe."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "i = 0\n",
    "data = []\n",
    "\n",
    "for row in range(len(dask_df)):\n",
    "    \n",
    "    df = dask_df.compute()\n",
    "    \n",
    "    c = df.loc[row,\"county\"]\n",
    "    s = df.loc[row,\"state\"]\n",
    "    f = df.loc[row,\"FIPS\"]\n",
    "    y = df.loc[row, \"year\"]\n",
    "    \n",
    "    can_nm = df.loc[row, \"candidate\"]\n",
    "    party =  df.loc[row, \"party\"]\n",
    "    votes =  df.loc[row, \"candidatevotes\"]\n",
    "    year = df.loc[row, \"year\"]\n",
    "    \n",
    "    if f not in county[c].keys():\n",
    "        county[c][f] = {}\n",
    "        \n",
    "    county[c][f]['county'] = c\n",
    "    county[c][f][\"fips\"] = f\n",
    "    county[c][f][f\"candidate({party.strip()[0]})\"] = can_nm\n",
    "    county[c][f][f\"votes ({party.strip()[0]})\"] = votes\n",
    "    county[c][f]['year'] = y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = []\n",
    "for key, items in county.items():\n",
    "\n",
    "    for key, item in items.items():\n",
    "        data.append(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "dt = pd.DataFrame(data)\n",
    "df = dd.from_pandas(dt,npartitions=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>county</th>\n",
       "      <th>fips</th>\n",
       "      <th>candidate(d)</th>\n",
       "      <th>votes (d)</th>\n",
       "      <th>year</th>\n",
       "      <th>candidate(r)</th>\n",
       "      <th>votes (r)</th>\n",
       "      <th>candidate(O)</th>\n",
       "      <th>votes (O)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Autauga</td>\n",
       "      <td>01001</td>\n",
       "      <td>Hillary Clinton</td>\n",
       "      <td>5936.0</td>\n",
       "      <td>2016</td>\n",
       "      <td>Donald Trump</td>\n",
       "      <td>18172.0</td>\n",
       "      <td>Other</td>\n",
       "      <td>865.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Baldwin</td>\n",
       "      <td>01003</td>\n",
       "      <td>Hillary Clinton</td>\n",
       "      <td>18458.0</td>\n",
       "      <td>2016</td>\n",
       "      <td>Donald Trump</td>\n",
       "      <td>72883.0</td>\n",
       "      <td>Other</td>\n",
       "      <td>3874.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Baldwin</td>\n",
       "      <td>13009</td>\n",
       "      <td>Hillary Clinton</td>\n",
       "      <td>7970.0</td>\n",
       "      <td>2016</td>\n",
       "      <td>Donald Trump</td>\n",
       "      <td>7697.0</td>\n",
       "      <td>Other</td>\n",
       "      <td>449.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Barbour</td>\n",
       "      <td>01005</td>\n",
       "      <td>Hillary Clinton</td>\n",
       "      <td>4871.0</td>\n",
       "      <td>2016</td>\n",
       "      <td>Donald Trump</td>\n",
       "      <td>5454.0</td>\n",
       "      <td>Other</td>\n",
       "      <td>144.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Barbour</td>\n",
       "      <td>54001</td>\n",
       "      <td>Hillary Clinton</td>\n",
       "      <td>1222.0</td>\n",
       "      <td>2016</td>\n",
       "      <td>Donald Trump</td>\n",
       "      <td>4527.0</td>\n",
       "      <td>Other</td>\n",
       "      <td>305.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    county   fips     candidate(d)  ...  votes (r)  candidate(O) votes (O)\n",
       "0  Autauga  01001  Hillary Clinton  ...    18172.0         Other     865.0\n",
       "1  Baldwin  01003  Hillary Clinton  ...    72883.0         Other    3874.0\n",
       "2  Baldwin  13009  Hillary Clinton  ...     7697.0         Other     449.0\n",
       "3  Barbour  01005  Hillary Clinton  ...     5454.0         Other     144.0\n",
       "4  Barbour  54001  Hillary Clinton  ...     4527.0         Other     305.0\n",
       "\n",
       "[5 rows x 9 columns]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {},
	"outputs": [],
	"source": [
	"c = dask_df[\"county\"].unique().compute()\n",
	"county = dict((i,dict()) for i in list(c))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Creating a new dataframe would have been done by `dd.DataFrame()` but dask advices us not use this class directly. Instead use functions like\n",
	"``dd.read_csv``, ``dd.read_parquet``, or ``dd.from_pandas``.\n",
	"So, we will work with pandas to create a new dataframe then convert it to a Dask dataframe."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [],
	"source": [
	"i = 0\n",
	"data = []\n",
	"\n",
	"for row in range(len(dask_df)):\n",
	" \n",
	" df = dask_df.compute()\n",
	" \n",
	" c = df.loc[row,\"county\"]\n",
	" s = df.loc[row,\"state\"]\n",
	" f = df.loc[row,\"FIPS\"]\n",
	" y = df.loc[row, \"year\"]\n",
	" \n",
	" can_nm = df.loc[row, \"candidate\"]\n",
	" party = df.loc[row, \"party\"]\n",
	" votes = df.loc[row, \"candidatevotes\"]\n",
	" year = df.loc[row, \"year\"]\n",
	" \n",
	" if f not in county[c].keys():\n",
	" county[c][f] = {}\n",
	" \n",
	" county[c][f]['county'] = c\n",
	" county[c][f][\"fips\"] = f\n",
	" county[c][f][f\"candidate({party.strip()[0]})\"] = can_nm\n",
	" county[c][f][f\"votes ({party.strip()[0]})\"] = votes\n",
	" county[c][f]['year'] = y"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {},
	"outputs": [],
	"source": [
	"data = []\n",
	"for key, items in county.items():\n",
	"\n",
	" for key, item in items.items():\n",
	" data.append(item)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {},
	"outputs": [],
	"source": [
	"dt = pd.DataFrame(data)\n",
	"df = dd.from_pandas(dt,npartitions=1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>county</th>\n",
	" <th>fips</th>\n",
	" <th>candidate(d)</th>\n",
	" <th>votes (d)</th>\n",
	" <th>year</th>\n",
	" <th>candidate(r)</th>\n",
	" <th>votes (r)</th>\n",
	" <th>candidate(O)</th>\n",
	" <th>votes (O)</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>Autauga</td>\n",
	" <td>01001</td>\n",
	" <td>Hillary Clinton</td>\n",
	" <td>5936.0</td>\n",
	" <td>2016</td>\n",
	" <td>Donald Trump</td>\n",
	" <td>18172.0</td>\n",
	" <td>Other</td>\n",
	" <td>865.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>Baldwin</td>\n",
	" <td>01003</td>\n",
	" <td>Hillary Clinton</td>\n",
	" <td>18458.0</td>\n",
	" <td>2016</td>\n",
	" <td>Donald Trump</td>\n",
	" <td>72883.0</td>\n",
	" <td>Other</td>\n",
	" <td>3874.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>Baldwin</td>\n",
	" <td>13009</td>\n",
	" <td>Hillary Clinton</td>\n",
	" <td>7970.0</td>\n",
	" <td>2016</td>\n",
	" <td>Donald Trump</td>\n",
	" <td>7697.0</td>\n",
	" <td>Other</td>\n",
	" <td>449.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>Barbour</td>\n",
	" <td>01005</td>\n",
	" <td>Hillary Clinton</td>\n",
	" <td>4871.0</td>\n",
	" <td>2016</td>\n",
	" <td>Donald Trump</td>\n",
	" <td>5454.0</td>\n",
	" <td>Other</td>\n",
	" <td>144.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>Barbour</td>\n",
	" <td>54001</td>\n",
	" <td>Hillary Clinton</td>\n",
	" <td>1222.0</td>\n",
	" <td>2016</td>\n",
	" <td>Donald Trump</td>\n",
	" <td>4527.0</td>\n",
	" <td>Other</td>\n",
	" <td>305.0</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" county fips candidate(d) ... votes (r) candidate(O) votes (O)\n",
	"0 Autauga 01001 Hillary Clinton ... 18172.0 Other 865.0\n",
	"1 Baldwin 01003 Hillary Clinton ... 72883.0 Other 3874.0\n",
	"2 Baldwin 13009 Hillary Clinton ... 7697.0 Other 449.0\n",
	"3 Barbour 01005 Hillary Clinton ... 5454.0 Other 144.0\n",
	"4 Barbour 54001 Hillary Clinton ... 4527.0 Other 305.0\n",
	"\n",
	"[5 rows x 9 columns]"
	]
	},
	"execution_count": 23,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df.head()"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}