# kvnkho/comparison.py

## comparison.py
# Comparison of two ways to build an inferred_state column: take home_state,
# fall back to work_state, and finally fall back to the state that
# area_to_state associates with the first three characters of phone.
area_to_state = {
    "217": "IL",
    "312": "IL",
    "415": "CA",
    "352": "FL",
}

# Pandas implementation
# Each fillna only touches rows that are still missing, so the order of the
# calls is the priority order of the sources.
df['inferred_state'] = (
    df['home_state']
    .fillna(df['work_state'])
    .fillna(df['phone'].str.slice(start=0, stop=3).map(area_to_state))
)

# Spark implementation
from pyspark.sql.functions import coalesce, col, substring, create_map, lit
from itertools import chain

# Flatten the dict into alternating key/value literal columns
# (key1, value1, key2, value2, ...), the layout create_map groups into pairs.
mapping_expr = create_map(
    [lit(token) for token in chain.from_iterable(area_to_state.items())]
)

# coalesce yields the first non-null argument per row, so the argument order
# is the priority order: home_state, then work_state, then the phone lookup.
df = df.withColumn(
    'inferred_state',
    coalesce(
        'home_state',
        'work_state',
        # Spark's substring() takes a 1-based start position, so the first
        # three characters of phone begin at position 1 (not 0).
        mapping_expr.getItem(substring(col("phone"), 1, 3)),
    ),
)
	# NOTE(review): this tab-indented section repeats the unindented statements
	# earlier in the file -- confirm whether it is intentional or should be
	# removed.
	# Comparison of two ways to build an inferred_state column: take home_state,
	# fall back to work_state, and finally fall back to the state that
	# area_to_state associates with the first three characters of phone.
	area_to_state = {
		"217": "IL",
		"312": "IL",
		"415": "CA",
		"352": "FL",
	}

	# Pandas implementation
	# Each fillna only touches rows that are still missing, so the order of the
	# calls is the priority order of the sources.
	df['inferred_state'] = (
		df['home_state']
		.fillna(df['work_state'])
		.fillna(df['phone'].str.slice(start=0, stop=3).map(area_to_state))
	)

	# Spark implementation
	from pyspark.sql.functions import coalesce, col, substring, create_map, lit
	from itertools import chain

	# Flatten the dict into alternating key/value literal columns
	# (key1, value1, key2, value2, ...), the layout create_map groups into pairs.
	mapping_expr = create_map(
		[lit(token) for token in chain.from_iterable(area_to_state.items())]
	)

	# coalesce yields the first non-null argument per row, so the argument order
	# is the priority order: home_state, then work_state, then the phone lookup.
	df = df.withColumn(
		'inferred_state',
		coalesce(
			'home_state',
			'work_state',
			# Spark's substring() takes a 1-based start position, so the first
			# three characters of phone begin at position 1 (not 0).
			mapping_expr.getItem(substring(col("phone"), 1, 3)),
		),
	)