# yifeihuang/normalization.py

## normalization.py
# Build one combined product table from the two catalogs. Each side is
# projected onto the same six columns (source, source_id, name, description,
# manufacturer, price) and tagged with a literal `source` label; Amazon's
# `title` column is exposed as `name`.
# `google` and `amazon` are defined outside this snippet -- presumably Spark
# DataFrames, in which case union() pairs columns by position, so the two
# select lists must stay in the same order.
df = (
  google.select(
    f.lit('google').alias('source'),
    f.col('id').alias('source_id'),
    f.col('name'),
    f.col('description'),
    f.col('manufacturer'),
    f.col('price'),
  )
  .union(
    amazon.select(
      f.lit('amazon').alias('source'),
      f.col('id').alias('source_id'),
      f.col('title').alias('name'),
      f.col('description'),
      f.col('manufacturer'),
      f.col('price'),
    )
  )
)

def trim_to_null(c):
  """Return column `c` trimmed and lower-cased, with blank values as null.

  The value is trimmed and lower-cased first. If what remains is the empty
  string or the text 'null' (in any letter case), the result is null;
  otherwise the trimmed, lower-cased value is returned.

  Args:
    c: Name of the column to clean; it is wrapped with f.col().

  Returns:
    A column expression suitable for passing to withColumn().
  """
  cleaned = f.lower(f.trim(f.col(c)))
  # Compare after lower-casing so that 'NULL' / 'Null' are treated as missing
  # too, instead of surviving as the string 'null' in the output.
  is_missing = (cleaned == '') | (cleaned == 'null')
  return f.when(is_missing, None).otherwise(cleaned)

# Columns cleaned as text, and columns cleaned and then cast to a number
# (presumably numeric values that arrive as strings -- confirm against input).
STRING_COLS = ['name', 'description', 'manufacturer']
STRING_NUM_COLS = ['price']

# Text columns: run each through trim_to_null and lower-case the result,
# overwriting the column in place.
for c in STRING_COLS:
  df = df.withColumn(c, f.lower(trim_to_null(c)))

# Numeric columns: same cleaning, then cast to 'float'.
# NOTE(review): 'float' is Spark's 32-bit type -- confirm that precision is
# sufficient for prices.
for c in STRING_NUM_COLS:
  df = df.withColumn(c, trim_to_null(c).cast('float'))

# Hyphenated words and version numbers seem salient to product names,
# so treat them differently by concatenating the parts.
def replace_contiguous_special_char(c, replace_str=''):
  """Replace '.', '-' or an apostrophe that sits between two word characters.

  Only separators with a word character (letter, digit or underscore)
  directly on both sides are matched, so leading or trailing punctuation is
  left alone. With the default empty replacement the separator is removed,
  e.g. 'wi-fi' -> 'wifi' and '2.0' -> '20'.

  Args:
    c: Forwarded as the first argument of f.regexp_replace (the column to
      operate on).
    replace_str: Text substituted for each matched separator.

  Returns:
    The column expression produced by f.regexp_replace.
  """
  # Raw string: the pattern reaches the regex engine unchanged and Python does
  # not warn about invalid escape sequences such as \d or \w.
  return f.regexp_replace(c, r"(?<=(\d|\w))(\.|-|')(?=(\d|\w))", replace_str)

def replace_special_char(c, replace_str=' '):
  """Replace every character that is not a regex word character.

  Each matching character is substituted individually, so a run of
  punctuation becomes a run of `replace_str`.

  Args:
    c: Forwarded as the first argument of f.regexp_replace (the column to
      operate on).
    replace_str: Text substituted for each matched character; a single
      space by default.

  Returns:
    The column expression produced by f.regexp_replace.
  """
  # Raw string: avoids Python's invalid-escape warning for \W while passing
  # exactly the same pattern to the regex engine.
  return f.regexp_replace(c, r"[\W]", replace_str)


# Replace every non-word character in the three text columns with a space
# (the default replacement of replace_special_char).
# NOTE(review): replace_contiguous_special_char is defined in this file but is
# not applied here, so hyphens, dots and apostrophes inside words become
# spaces rather than being removed -- confirm this is intended.
processed_df = (
  df
  .withColumn('name', replace_special_char('name'))
  .withColumn('description', replace_special_char('description'))
  .withColumn('manufacturer', replace_special_char('manufacturer'))
)


# Show the result. `display` is not defined in this file -- presumably the
# notebook environment provides it.
display(processed_df)
	# Build one combined product table from the two catalogs. Each side is
	# projected onto the same six columns (source, source_id, name, description,
	# manufacturer, price) and tagged with a literal `source` label; Amazon's
	# `title` column is exposed as `name`.
	# `google` and `amazon` are defined outside this snippet -- presumably Spark
	# DataFrames, in which case union() pairs columns by position, so the two
	# select lists must stay in the same order.
	df = (
		google.select(
			f.lit('google').alias('source'),
			f.col('id').alias('source_id'),
			f.col('name'),
			f.col('description'),
			f.col('manufacturer'),
			f.col('price'),
		)
		.union(
			amazon.select(
				f.lit('amazon').alias('source'),
				f.col('id').alias('source_id'),
				f.col('title').alias('name'),
				f.col('description'),
				f.col('manufacturer'),
				f.col('price'),
			)
		)
	)

	def trim_to_null(c):
		"""Return column `c` trimmed and lower-cased, with blank values as null.

		The value is trimmed and lower-cased first. If what remains is the empty
		string or the text 'null' (in any letter case), the result is null;
		otherwise the trimmed, lower-cased value is returned.

		Args:
			c: Name of the column to clean; it is wrapped with f.col().

		Returns:
			A column expression suitable for passing to withColumn().
		"""
		cleaned = f.lower(f.trim(f.col(c)))
		# Compare after lower-casing so that 'NULL' / 'Null' are treated as
		# missing too, instead of surviving as the string 'null' in the output.
		is_missing = (cleaned == '') | (cleaned == 'null')
		return f.when(is_missing, None).otherwise(cleaned)

	# Columns cleaned as text, and columns cleaned and then cast to a number
	# (presumably numeric values that arrive as strings -- confirm against input).
	STRING_COLS = ['name', 'description', 'manufacturer']
	# Text columns: run each through trim_to_null and lower-case the result,
	# overwriting the column in place.
	for c in STRING_COLS:
		df = df.withColumn(c, f.lower(trim_to_null(c)))

	STRING_NUM_COLS = ['price']
	# Numeric columns: same cleaning, then cast to 'float'.
	# NOTE(review): 'float' is Spark's 32-bit type -- confirm that precision is
	# sufficient for prices.
	for c in STRING_NUM_COLS:
		df = df.withColumn(c, trim_to_null(c).cast('float'))

	# Hyphenated words and version numbers seem salient to product names,
	# so treat them differently by concatenating the parts.
	def replace_contiguous_special_char(c, replace_str=''):
		"""Replace '.', '-' or an apostrophe that sits between two word characters.

		Only separators with a word character (letter, digit or underscore)
		directly on both sides are matched, so leading or trailing punctuation
		is left alone. With the default empty replacement the separator is
		removed, e.g. 'wi-fi' -> 'wifi' and '2.0' -> '20'.

		Args:
			c: Forwarded as the first argument of f.regexp_replace (the column
				to operate on).
			replace_str: Text substituted for each matched separator.

		Returns:
			The column expression produced by f.regexp_replace.
		"""
		# The pipes must be unescaped to act as alternation; written as '\|'
		# they only match a literal '|' character. Raw string so the pattern
		# reaches the regex engine unchanged.
		return f.regexp_replace(c, r"(?<=(\d|\w))(\.|-|')(?=(\d|\w))", replace_str)

	def replace_special_char(c, replace_str=' '):
		"""Replace every character that is not a regex word character.

		Each matching character is substituted individually, so a run of
		punctuation becomes a run of `replace_str`.

		Args:
			c: Forwarded as the first argument of f.regexp_replace (the column
				to operate on).
			replace_str: Text substituted for each matched character; a single
				space by default.

		Returns:
			The column expression produced by f.regexp_replace.
		"""
		# Raw string: avoids Python's invalid-escape warning for \W while
		# passing exactly the same pattern to the regex engine.
		return f.regexp_replace(c, r"[\W]", replace_str)



	# Replace every non-word character in the three text columns with a space
	# (the default replacement of replace_special_char).
	# NOTE(review): replace_contiguous_special_char is defined in this file but
	# is not applied here, so hyphens, dots and apostrophes inside words become
	# spaces rather than being removed -- confirm this is intended.
	processed_df = (
		df
		.withColumn('name', replace_special_char('name'))
		.withColumn('description', replace_special_char('description'))
		.withColumn('manufacturer', replace_special_char('manufacturer'))
	)


	# Show the result. `display` is not defined in this file -- presumably the
	# notebook environment provides it.
	display(processed_df)