Skip to content

Instantly share code, notes, and snippets.

@yogenderPalChandra
Created July 3, 2022 09:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yogenderPalChandra/98ef00d884fec385afdd7d46dd493711 to your computer and use it in GitHub Desktop.
Save yogenderPalChandra/98ef00d884fec385afdd7d46dd493711 to your computer and use it in GitHub Desktop.
import os
def file_for_regex_transformer():
    """Write the price regex rule to ``regex_rules.txt`` and return its path.

    The file is created (or overwritten) in the current working directory.
    Its path is what ``nlp_pipline_and_clean`` hands to
    ``RegexMatcher.setExternalRules``.

    Returns:
        str: Absolute path of the rule file that was just written.
    """
    # Raw string: the backslashes belong to the regex, not to Python. The
    # value is identical to the former non-raw literal, but no longer relies
    # on invalid escape sequences (``\d``, ``\&``, ``\w``, ``\;``).
    rules = r'.\d\&\w+\;\d+&\w+;\d+&\w+;Kč*'
    # Build the path once so the file written and the path returned cannot
    # drift apart.
    rules_path = os.path.join(os.getcwd(), "regex_rules.txt")
    # Explicit UTF-8: the rule contains the non-ASCII character "č", which
    # the platform default encoding may be unable to encode.
    with open(rules_path, 'w', encoding='utf-8') as f:
        f.write(rules)
    return rules_path
# Creates a RegexMatcher, calls extractParamMap() on it and discards the
# result; this runs at module level, i.e. whenever the file is executed.
# NOTE(review): looks like a leftover notebook inspection cell -- confirm
# whether it is still needed.
RegexMatcher().extractParamMap()
def nlp_pipline_and_clean(rdd_df):
    """Extract price strings from ``rdd_df`` and return them formatted.

    The pipeline has two stages:

    * ``DocumentAssembler`` reads the ``text`` column and writes the
      ``assembled`` column.
    * ``RegexMatcher`` reads ``assembled``, applies the rule file written by
      ``file_for_regex_transformer()`` with the ``MATCH_ALL`` strategy and
      writes the ``regex_matches`` column.

    Every match is then cleaned and re-formatted with a thousands separator.

    Args:
        rdd_df: Spark DataFrame with a ``text`` column holding the raw text
            to search (presumably scraped HTML -- confirm against caller).

    Returns:
        An RDD of strings of the form ``'<amount with commas> <currency>'``.
        The RDD is lazy: cleaning errors (e.g. a match with a non-numeric
        amount or without a ``Kč`` suffix) surface when an action is run
        on it.
    """

    def format_price(match):
        """Clean one raw regex match and format it as ``'1,234 Kč'``."""
        # Imported here so the function is self-contained when PySpark
        # serialises it to the executors.
        import re

        # The matching rule requires ``&\w+;`` entities between the digit
        # groups, so every match contains them; they must be removed or
        # int() below can never succeed.
        cleaned = re.sub(r'&\w+;', '', match)
        # Drop all whitespace (plain and non-breaking) and any '>' that the
        # rule's leading wildcard picked up.
        cleaned = ''.join(cleaned.split()).replace('>', '')
        # Separate the amount from the currency suffix.
        parts = cleaned.replace('Kč', ' Kč').split()
        return '{:,} {}'.format(int(parts[0]), parts[1])

    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("assembled")

    regex_matcher = RegexMatcher() \
        .setInputCols('assembled') \
        .setStrategy("MATCH_ALL") \
        .setOutputCol("regex_matches") \
        .setExternalRules(path=file_for_regex_transformer(), delimiter=',')

    pipeline = Pipeline(stages=[
        document_assembler,
        regex_matcher,
    ])

    matches = pipeline.fit(rdd_df).transform(rdd_df) \
        .select("regex_matches.result")

    # Each row holds the list of matches for one input row; flatten the
    # lists, then clean every match in a single pass.
    return matches.rdd \
        .flatMap(lambda row: row[0]) \
        .map(format_price)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment