Skip to content

Instantly share code, notes, and snippets.

@yifeihuang
Created September 6, 2020 20:56
Show Gist options
  • Save yifeihuang/7dad7494b33dec613c6724d6bdf67738 to your computer and use it in GitHub Desktop.
Save yifeihuang/7dad7494b33dec613c6724d6bdf67738 to your computer and use it in GitHub Desktop.
Data source normalization
# Stack the Google and Amazon product catalogs into a single DataFrame with
# one shared column layout, tagging each row with the catalog it came from.
# DataFrame.union pairs columns by position, so both projections list their
# columns in the same order; Amazon's `title` is exposed as `name`.
df = (
    google.select(
        f.lit('google').alias('source'),
        f.col('id').alias('source_id'),
        f.col('name'),
        f.col('description'),
        f.col('manufacturer'),
        f.col('price'),
    ).union(
        amazon.select(
            f.lit('amazon').alias('source'),
            f.col('id').alias('source_id'),
            f.col('title').alias('name'),
            f.col('description'),
            f.col('manufacturer'),
            f.col('price'),
        )
    )
)
def trim_to_null(c):
    """Return column ``c`` trimmed and lower-cased, with placeholders nulled.

    A value that is empty after trimming, or that spells 'null' in any
    letter case, becomes NULL. Every other value is returned trimmed and
    lower-cased. NULL inputs stay NULL.

    Args:
        c: Name of the column to clean (resolved with ``f.col``).

    Returns:
        A column expression holding the cleaned value.
    """
    # Normalise first, then test: checking for 'null' before lower-casing let
    # 'NULL' / 'Null' slip through and come out as the literal string 'null'.
    cleaned = f.lower(f.trim(f.col(c)))
    return f.when(cleaned.isin('', 'null'), None).otherwise(cleaned)
# Text attributes: blank / 'null' placeholders become NULL, the rest is
# trimmed and lower-cased.
# NOTE: trim_to_null already lower-cases its result, so the outer f.lower is
# redundant; it is kept so the generated expression stays the same.
STRING_COLS = ['name', 'description', 'manufacturer']
for text_col in STRING_COLS:
    df = df.withColumn(text_col, f.lower(trim_to_null(text_col)))

# Numeric attributes cleaned as strings first, then cast to float.
STRING_NUM_COLS = ['price']
for num_col in STRING_NUM_COLS:
    df = df.withColumn(num_col, trim_to_null(num_col).cast('float'))
# Hyphenated words and version numbers seem salient to product names, so
# treat them differently: join the two sides instead of splitting them apart.
def replace_contiguous_special_char(c, replace_str=''):
    """Replace a '.', '-' or apostrophe that sits between two word characters.

    Only separators with a digit/word character immediately before AND after
    them are matched (look-behind / look-ahead), so leading, trailing or
    free-standing punctuation is left alone.

    Args:
        c: Column expression or column name, forwarded as-is to
            ``f.regexp_replace``.
        replace_str: Replacement text. The default '' concatenates the two
            sides of the separator.

    Returns:
        The ``f.regexp_replace`` column expression.
    """
    # Raw string: the earlier non-raw literal relied on invalid escape
    # sequences (\d, \w, \.), which Python 3.12+ reports as a SyntaxWarning.
    # The pattern text handed to Spark is unchanged.
    return f.regexp_replace(c, r"(?<=(\d|\w))(\.|-|')(?=(\d|\w))", replace_str)
def replace_special_char(c, replace_str=' '):
    """Replace every non-word character in ``c`` with ``replace_str``.

    Args:
        c: Column expression or column name, forwarded as-is to
            ``f.regexp_replace``.
        replace_str: Replacement text for each matched character. Defaults to
            a single space.

    Returns:
        The ``f.regexp_replace`` column expression.
    """
    # Raw string: "\W" in a non-raw literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+). The pattern text itself is unchanged.
    return f.regexp_replace(c, r"[\W]", replace_str)
# Replace non-word characters in each free-text column with the default
# replacement of replace_special_char.
# NOTE(review): replace_contiguous_special_char is defined in this file but
# never applied here - confirm whether it was meant to run on `name` first.
processed_df = df
for text_col in ('name', 'description', 'manufacturer'):
    processed_df = processed_df.withColumn(text_col, replace_special_char(text_col))

display(processed_df)
@diegoquintanav
Copy link

diegoquintanav commented Feb 26, 2021

What is `f` here? I suspect it is:

import pyspark.sql.functions as f

@havardox
Copy link

havardox commented Dec 5, 2022

It seems that the function replace_contiguous_special_char isn't used anywhere in the Medium code. What purpose does it serve?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment