Skip to content

Instantly share code, notes, and snippets.

@sansagara
Last active November 16, 2020 12:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sansagara/0619f21e9e56d2547e028669058644e8 to your computer and use it in GitHub Desktop.
Save sansagara/0619f21e9e56d2547e028669058644e8 to your computer and use it in GitHub Desktop.
Tag-based feature engineering
import operator as op
from typing import Any, Dict, List, Union
from pyspark.sql import functions as f
def regex_like(inputs: str, values: Union[List[str], str], tag_name: str) -> "pyspark.sql.Column":
    """
    Adds a tag to this row if the value matches any of the given regexes.

    Args:
        inputs: Name of the column whose value is tested.
        values: A single regex pattern, or a list of patterns. With a list,
            the row is tagged when at least one pattern matches.
        tag_name: Tag emitted on a match; also used as the output column alias.

    Returns:
        A column aliased ``tag_name`` that holds ``tag_name`` where the value
        matches and the literal string ``"null"`` (not a SQL NULL) otherwise.

    Raises:
        ValueError: If ``values`` is an empty list. An empty pattern would
            match every row, so it is rejected instead of silently tagging
            everything.
    """
    # The return annotation is quoted because this module only binds
    # ``pyspark.sql.functions`` as ``f``; the bare name ``pyspark`` is not in
    # scope and an unquoted annotation would raise NameError at definition.
    patterns = [values] if isinstance(values, str) else list(values)
    if not patterns:
        raise ValueError("regex_like requires at least one regex pattern")

    if len(patterns) == 1:
        pattern = patterns[0]
    else:
        # ``rlike`` accepts one pattern string, so OR the alternatives
        # together. Each is wrapped in a non-capturing group so that a
        # top-level ``|`` inside one pattern cannot bleed into its neighbours.
        pattern = "|".join(f"(?:{p})" for p in patterns)

    return (
        f.when(f.col(inputs).rlike(pattern), tag_name)
        .otherwise("null")
        .alias(tag_name)
    )
def is_in(inputs: str, values: List[str], tag_name: str) -> "pyspark.sql.Column":
    """
    Adds a tag to this row if the column value is one of the given values.

    Args:
        inputs: Name of the column whose value is tested.
        values: Allowed values; the row is tagged when the column value
            equals any of them.
        tag_name: Tag emitted on a match; also used as the output column alias.

    Returns:
        A column aliased ``tag_name`` that holds ``tag_name`` where the value
        is in ``values`` and the literal string ``"null"`` (not a SQL NULL)
        otherwise.
    """
    # The return annotation is quoted because this module only binds
    # ``pyspark.sql.functions`` as ``f``; the bare name ``pyspark`` is not in
    # scope and an unquoted annotation would raise NameError at definition.
    return (
        f.when(f.col(inputs).isin(values), tag_name)
        .otherwise("null")
        .alias(tag_name)
    )
def greater_equal(inputs: str, values: Union[int, float], tag_name: str) -> "pyspark.sql.Column":
    """
    Adds a tag to this row if the column value is greater than or equal to a threshold.

    Args:
        inputs: Name of the column whose value is compared.
        values: Threshold the column value is compared against (inclusive).
        tag_name: Tag emitted on a match; also used as the output column alias.

    Returns:
        A column aliased ``tag_name`` that holds ``tag_name`` where
        ``column >= values`` and the literal string ``"null"`` (not a SQL
        NULL) otherwise.
    """
    # The return annotation is quoted because this module only binds
    # ``pyspark.sql.functions`` as ``f``; the bare name ``pyspark`` is not in
    # scope and an unquoted annotation would raise NameError at definition.
    return (
        f.when(f.col(inputs) >= f.lit(values), tag_name)
        .otherwise("null")
        .alias(tag_name)
    )
def greater(inputs: str, values: Union[int, float], tag_name: str) -> "pyspark.sql.Column":
    """
    Adds a tag to this row if the column value is strictly greater than a threshold.

    Args:
        inputs: Name of the column whose value is compared.
        values: Threshold the column value is compared against (exclusive).
        tag_name: Tag emitted on a match; also used as the output column alias.

    Returns:
        A column aliased ``tag_name`` that holds ``tag_name`` where
        ``column > values`` and the literal string ``"null"`` (not a SQL
        NULL) otherwise.
    """
    # The return annotation is quoted because this module only binds
    # ``pyspark.sql.functions`` as ``f``; the bare name ``pyspark`` is not in
    # scope and an unquoted annotation would raise NameError at definition.
    return (
        f.when(f.col(inputs) > f.lit(values), tag_name)
        .otherwise("null")
        .alias(tag_name)
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment