-
-
Save yogenderPalChandra/98ef00d884fec385afdd7d46dd493711 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
def file_for_regex_transformer():
    """Write the price regex rule to ``regex_rules.txt`` and return its path.

    The file is created (or overwritten) in the current working directory.
    The returned path is what ``nlp_pipline_and_clean`` hands to
    ``RegexMatcher.setExternalRules``.

    Returns:
        str: Path of the rules file, built from ``os.getcwd()``.
    """
    # Raw string: the backslashes (\d, \w, \&, \;) are regex syntax, not Python
    # escapes. A non-raw literal yields the same characters but triggers
    # invalid-escape warnings on modern Python.
    # NOTE(review): the trailing ``*`` quantifies only ``č``, so a bare ``K``
    # also satisfies the rule -- confirm that is intended.
    rules = r'''.\d\&\w+\;\d+&\w+;\d+&\w+;Kč*'''

    # Build the path once so the file written and the path returned cannot drift.
    rules_path = os.path.join(os.getcwd(), "regex_rules.txt")

    # Explicit UTF-8: the rule contains the non-ASCII character ``č``, which the
    # platform default encoding may be unable to represent.
    with open(rules_path, "w", encoding="utf-8") as rules_file:
        rules_file.write(rules)
    return rules_path
# Builds a default RegexMatcher and asks for its parameter map. The result is
# not stored or printed here -- presumably a leftover from interactive
# inspection of the available parameters (TODO confirm before removing).
RegexMatcher().extractParamMap()
def nlp_pipline_and_clean(rdd_df):
    """Run the regex-matching pipeline on ``rdd_df`` and format the matches.

    Pipeline stages:

    * ``DocumentAssembler`` reads the ``text`` column and writes ``assembled``.
    * ``RegexMatcher`` reads ``assembled``, applies the rule file produced by
      ``file_for_regex_transformer()`` with the ``MATCH_ALL`` strategy, and
      writes ``regex_matches``.

    The pipeline is fitted on and applied to the same ``rdd_df``. Every string
    in ``regex_matches.result`` is then cleaned and reformatted as
    ``"<amount with thousands separators> <currency>"``, e.g.
    ``"1,234,567 Kč"``.

    Args:
        rdd_df: Object accepted by ``Pipeline.fit`` / ``transform`` that has a
            ``text`` column (presumably a Spark DataFrame -- confirm with
            the caller).

    Returns:
        The RDD obtained from the transformed data, holding one formatted
        price string per regex match.

    Raises:
        ValueError: On the workers, if a cleaned match does not start with an
            integer amount.
        IndexError: On the workers, if a match carries no ``Kč`` suffix.
    """

    def _format_price(raw_match):
        """Turn one raw regex match into ``'<amount:,> <currency>'``."""
        # Imported here so the nested function stays self-contained when it is
        # serialized and shipped to the workers.
        import re

        # The rule requires ``&\w+;`` entities (e.g. ``&nbsp;``) between the
        # digit groups, so every match contains them; they must be removed or
        # ``int()`` below can never succeed.
        cleaned = re.sub(r"&\w+;", "", raw_match)
        cleaned = cleaned.replace(" ", "").replace(">", "")
        # Re-insert a single space so amount and currency can be split apart.
        parts = cleaned.replace("Kč", " Kč").split()
        return "{:,} {}".format(int(parts[0]), str(parts[1]))

    document_assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("assembled")
    )

    regex_matcher = (
        RegexMatcher()
        .setInputCols("assembled")
        .setStrategy("MATCH_ALL")
        .setOutputCol("regex_matches")
        .setExternalRules(path=file_for_regex_transformer(), delimiter=",")
    )

    nlp_pipeline = Pipeline(stages=[document_assembler, regex_matcher])
    transformed = nlp_pipeline.fit(rdd_df).transform(rdd_df)

    # Each row holds a list of match strings; flatten to one element per match.
    matches = transformed.select("regex_matches.result").rdd.flatMap(
        lambda row: row[0]
    )
    return matches.map(_format_price)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment