Skip to content

Instantly share code, notes, and snippets.

@codebrain001
Created April 12, 2020 22:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save codebrain001/77c4cdfdfe057cc661e56056eca35a71 to your computer and use it in GitHub Desktop.
Save codebrain001/77c4cdfdfe057cc661e56056eca35a71 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Distinct county names, materialized from the Dask dataframe.\n",
"c = dask_df[\"county\"].unique().compute()\n",
"# Start every county name off with an empty inner dict.\n",
"county = {name: {} for name in c}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A new dataframe could be created with `dd.DataFrame()`, but Dask advises against using this class directly. Instead, use functions like\n",
"``dd.read_csv``, ``dd.read_parquet``, or ``dd.from_pandas``.\n",
"So, we will use pandas to create the new dataframe and then convert it to a Dask dataframe."
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Materialize the Dask dataframe ONCE. Calling .compute() inside the loop\n",
"# re-executes the whole task graph for every single row.\n",
"df = dask_df.compute()\n",
"\n",
"# Walk the rows positionally rather than looking each cell up by index label:\n",
"# labels of a computed Dask frame are not guaranteed to be unique 0..n-1.\n",
"columns = (\"county\", \"FIPS\", \"year\", \"candidate\", \"party\", \"candidatevotes\")\n",
"\n",
"for c, f, y, can_nm, party, votes in zip(*(df[col] for col in columns)):\n",
"    # One record per (county name, FIPS) pair -- several counties share a name.\n",
"    record = county[c].setdefault(f, {})\n",
"    tag = party.strip()[0]  # first character of the party name, used in column labels\n",
"\n",
"    record['county'] = c\n",
"    record[\"fips\"] = f\n",
"    record[f\"candidate({tag})\"] = can_nm\n",
"    record[f\"votes ({tag})\"] = votes\n",
"    record['year'] = y"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Flatten county -> FIPS -> record into one flat list of record dicts.\n",
"data = [\n",
"    record\n",
"    for by_fips in county.values()\n",
"    for record in by_fips.values()\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Build the pandas frame from the record dicts, then wrap it as a\n",
"# single-partition Dask dataframe.\n",
"dt = pd.DataFrame(data=data)\n",
"df = dd.from_pandas(dt, npartitions=1)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>county</th>\n",
" <th>fips</th>\n",
" <th>candidate(d)</th>\n",
" <th>votes (d)</th>\n",
" <th>year</th>\n",
" <th>candidate(r)</th>\n",
" <th>votes (r)</th>\n",
" <th>candidate(O)</th>\n",
" <th>votes (O)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Autauga</td>\n",
" <td>01001</td>\n",
" <td>Hillary Clinton</td>\n",
" <td>5936.0</td>\n",
" <td>2016</td>\n",
" <td>Donald Trump</td>\n",
" <td>18172.0</td>\n",
" <td>Other</td>\n",
" <td>865.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Baldwin</td>\n",
" <td>01003</td>\n",
" <td>Hillary Clinton</td>\n",
" <td>18458.0</td>\n",
" <td>2016</td>\n",
" <td>Donald Trump</td>\n",
" <td>72883.0</td>\n",
" <td>Other</td>\n",
" <td>3874.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Baldwin</td>\n",
" <td>13009</td>\n",
" <td>Hillary Clinton</td>\n",
" <td>7970.0</td>\n",
" <td>2016</td>\n",
" <td>Donald Trump</td>\n",
" <td>7697.0</td>\n",
" <td>Other</td>\n",
" <td>449.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Barbour</td>\n",
" <td>01005</td>\n",
" <td>Hillary Clinton</td>\n",
" <td>4871.0</td>\n",
" <td>2016</td>\n",
" <td>Donald Trump</td>\n",
" <td>5454.0</td>\n",
" <td>Other</td>\n",
" <td>144.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Barbour</td>\n",
" <td>54001</td>\n",
" <td>Hillary Clinton</td>\n",
" <td>1222.0</td>\n",
" <td>2016</td>\n",
" <td>Donald Trump</td>\n",
" <td>4527.0</td>\n",
" <td>Other</td>\n",
" <td>305.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" county fips candidate(d) ... votes (r) candidate(O) votes (O)\n",
"0 Autauga 01001 Hillary Clinton ... 18172.0 Other 865.0\n",
"1 Baldwin 01003 Hillary Clinton ... 72883.0 Other 3874.0\n",
"2 Baldwin 13009 Hillary Clinton ... 7697.0 Other 449.0\n",
"3 Barbour 01005 Hillary Clinton ... 5454.0 Other 144.0\n",
"4 Barbour 54001 Hillary Clinton ... 4527.0 Other 305.0\n",
"\n",
"[5 rows x 9 columns]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment