Last active
July 9, 2019 12:09
-
-
Save drorata/23feb478e5723ca7074e79dec630c171 to your computer and use it in GitHub Desktop.
Mapping values to integers/floats/doubles
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Manual Mapping of Ordinal Features" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"\n", | |
"from itertools import chain\n", | |
"from pyspark.sql import functions as F\n", | |
"from pyspark.sql import SparkSession" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"spark = SparkSession \\\n", | |
" .builder \\\n", | |
" .appName(\"Pysparkexample\") \\\n", | |
" .config(\"spark.some.config.option\", \"some-value\") \\\n", | |
" .getOrCreate()\n", | |
"sc = spark.sparkContext" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Assume having columns in a dataframe with ordinal values which you want to map to integers (or doubles)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+-----+------+\n", | |
"|feat1| feat2|\n", | |
"+-----+------+\n", | |
"| HI| LARGE|\n", | |
"| MID|MEDIUM|\n", | |
"| LO| SMALL|\n", | |
"+-----+------+\n", | |
"only showing top 3 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"def fresh_df(N=100000, seed=None):\n", | |
" np.random.seed(seed)\n", | |
" feat1 = np.random.choice([\"HI\", \"LO\", \"MID\"], size=N)\n", | |
" feat2 = np.random.choice([\"SMALL\", \"MEDIUM\", \"LARGE\"], size=N)\n", | |
"\n", | |
" pdf = pd.DataFrame({\n", | |
" \"feat1\": feat1,\n", | |
" \"feat2\": feat2\n", | |
" })\n", | |
" return spark.createDataFrame(pdf)\n", | |
"\n", | |
"fresh_df(N=100).show(3)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"So, you build manually the mappings:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"feat1_dict = {\"HI\": 1, \"MID\": 2, \"LO\": 3}\n", | |
"feat2_dict = {\"SMALL\": 0, \"MEDIUM\": 1, \"LARGE\": 2}" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"And turn each of them into a dataframe of its own:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"feat1_map_df = spark.createDataFrame(pd.DataFrame([(k,v) for k, v in feat1_dict.items()], columns=[\"feat\", \"label\"]))\n", | |
"feat2_map_df = spark.createDataFrame(pd.DataFrame([(k,v) for k, v in feat2_dict.items()], columns=[\"feat\", \"label\"]))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Collect the mapping dataframes in a dictionary:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"mappings = {\n", | |
" \"feat1\": feat1_map_df,\n", | |
" \"feat2\": feat2_map_df\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+-----+------+\n", | |
"|feat1| feat2|\n", | |
"+-----+------+\n", | |
"| MID| LARGE|\n", | |
"| HI| LARGE|\n", | |
"| MID| SMALL|\n", | |
"| MID| LARGE|\n", | |
"| HI|MEDIUM|\n", | |
"| HI| SMALL|\n", | |
"| MID|MEDIUM|\n", | |
"| LO|MEDIUM|\n", | |
"| MID|MEDIUM|\n", | |
"| MID|MEDIUM|\n", | |
"+-----+------+\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"df = fresh_df(N=10, seed=42)\n", | |
"df.show()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Using `join`s you can now replace the values using the mappings:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+----+-----+\n", | |
"|feat|label|\n", | |
"+----+-----+\n", | |
"| HI| 1|\n", | |
"| MID| 2|\n", | |
"| LO| 3|\n", | |
"+----+-----+\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"mappings[\"feat1\"].show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def label_column(df, mapping, feat_name):\n", | |
" return df.join(F.broadcast(mapping), df[feat_name] == mapping.feat)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Here's an example:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = fresh_df(N=1000000, seed=42)\n", | |
"cols = df.columns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"250 ms ± 88.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"for col in cols:\n", | |
" df2 = label_column(df, mappings[col], col).drop('feat')\n", | |
" df2 = df2.withColumnRenamed(\"label\", col + \"_mapped\")\n", | |
"df2.cache().count()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment