yogenderPalChandra/regexTransformer.py Secret

## regexTransformer.py
import os
def file_for_regex_transformer():
    """Write the regex rule to ``regex_rules.txt`` in the current directory.

    The file is (re)created on every call, overwriting any previous content.
    Its path is what ``nlp_pipline_and_clean(rdd_df)`` hands to the
    ``RegexMatcher`` stage as the external rules file.

    Returns:
        str: ``regex_rules.txt`` joined onto the current working directory.
    """
    # Raw string: the pattern is full of regex escapes (\d, \w, \&, \;) that
    # are not valid Python string escapes and raise warnings in a plain
    # literal. The resulting text is unchanged.
    rules = r'''.\d\&\w+\;\d+&\w+;\d+&\w+;Kč*'''
    rules_path = os.path.join(os.getcwd(), "regex_rules.txt")
    # The rule contains a non-ASCII character ("č"), so pin the encoding
    # instead of relying on the platform default.
    with open(rules_path, 'w', encoding='utf-8') as f:
        f.write(rules)
    return rules_path


# Builds a default RegexMatcher and requests its parameter map at import time;
# the returned value is discarded.
# NOTE(review): looks like a leftover interactive inspection — confirm it is
# still needed here.
RegexMatcher().extractParamMap()

def nlp_pipline_and_clean(rdd_df):
    """Run the regex-matching pipeline on ``rdd_df`` and format every match.

    Pipeline stages:

    :DocumentAssembler: reads the ``text`` column and writes ``assembled``.
    :RegexMatcher: reads ``assembled`` and writes ``regex_matches``, using the
        rule file written by ``file_for_regex_transformer()`` and the
        ``MATCH_ALL`` strategy.

    The pipeline is fitted on and applied to ``rdd_df``; each string found in
    ``regex_matches.result`` is then cleaned and reformatted.

    :param rdd_df: DataFrame with a ``text`` column.
        NOTE(review): presumably raw HTML containing prices — confirm with
        the caller.
    :return: RDD of strings, one per regex match, of the form
        ``'<thousands-separated integer> <second token>'``, e.g.
        ``'1,234,567 Kč'``.
    """
    def _format_price(match):
        # Kept as a local function, like the lambdas it replaces.
        # Strip the '&nbsp;' separators and the '>' character, then make sure
        # the currency is whitespace-separated so split() yields two tokens.
        cleaned = match.replace('&nbsp;', '').replace('>', '')
        parts = cleaned.replace('Kč', ' Kč').split()
        # NOTE(review): a match that does not contain 'Kč' (the rule's
        # trailing 'č*' allows a bare 'K') splits into one token and
        # parts[1] raises IndexError.
        return '{:,} {}'.format(int(parts[0]), parts[1])

    document_assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("assembled")
    )

    regex_matcher = (
        RegexMatcher()
        .setInputCols('assembled')
        .setStrategy("MATCH_ALL")
        .setOutputCol("regex_matches")
        .setExternalRules(path=file_for_regex_transformer(), delimiter=',')
    )

    pipeline = Pipeline(stages=[document_assembler, regex_matcher])

    # Each row holds one field: the list of matched strings for that document.
    matches = (
        pipeline.fit(rdd_df)
        .transform(rdd_df)
        .select("regex_matches.result")
        .rdd.flatMap(lambda row: row[0])
    )
    return matches.map(_format_price)
	import os
	def file_for_regex_transformer():
		"""Write the regex rule to ``regex_rules.txt`` in the current directory.

		The file is (re)created on every call, overwriting any previous content.
		Its path is what ``nlp_pipline_and_clean(rdd_df)`` hands to the
		``RegexMatcher`` stage as the external rules file.

		Returns:
			str: ``regex_rules.txt`` joined onto the current working directory.
		"""
		# Raw string: the pattern is full of regex escapes (\d, \w, \&, \;) that
		# are not valid Python string escapes and raise warnings in a plain
		# literal. The resulting text is unchanged.
		rules = r'''.\d\&\w+\;\d+&\w+;\d+&\w+;Kč*'''
		rules_path = os.path.join(os.getcwd(), "regex_rules.txt")
		# The rule contains a non-ASCII character ("č"), so pin the encoding
		# instead of relying on the platform default.
		with open(rules_path, 'w', encoding='utf-8') as f:
			f.write(rules)
		return rules_path


	# Builds a default RegexMatcher and requests its parameter map; the
	# returned value is discarded.
	# NOTE(review): looks like a leftover interactive inspection — confirm it is
	# still needed here.
	RegexMatcher().extractParamMap()

	def nlp_pipline_and_clean(rdd_df):
		"""Run the regex-matching pipeline on ``rdd_df`` and format every match.

		Pipeline stages:

		:DocumentAssembler: reads the ``text`` column and writes ``assembled``.
		:RegexMatcher: reads ``assembled`` and writes ``regex_matches``, using the
			rule file written by ``file_for_regex_transformer()`` and the
			``MATCH_ALL`` strategy.

		The pipeline is fitted on and applied to ``rdd_df``; each string found in
		``regex_matches.result`` is then cleaned and reformatted.

		:param rdd_df: DataFrame with a ``text`` column.
			NOTE(review): presumably raw HTML containing prices — confirm with
			the caller.
		:return: RDD of strings, one per regex match, of the form
			``'<thousands-separated integer> <second token>'``, e.g.
			``'1,234,567 Kč'``.
		"""
		def _format_price(match):
			# Kept as a local function, like the lambdas it replaces.
			# The rule matches '&<word>;' entities between digit groups, so the
			# entity text itself ('&nbsp;') has to be stripped before int();
			# removing plain whitespace would leave it in place.
			cleaned = match.replace('&nbsp;', '').replace('>', '')
			# Make sure the currency is whitespace-separated so split() yields
			# two tokens.
			parts = cleaned.replace('Kč', ' Kč').split()
			# NOTE(review): a match that does not contain 'Kč' (the rule's
			# trailing 'č*' allows a bare 'K') splits into one token and
			# parts[1] raises IndexError.
			return '{:,} {}'.format(int(parts[0]), parts[1])

		document_assembler = (
			DocumentAssembler()
			.setInputCol("text")
			.setOutputCol("assembled")
		)

		regex_matcher = (
			RegexMatcher()
			.setInputCols('assembled')
			.setStrategy("MATCH_ALL")
			.setOutputCol("regex_matches")
			.setExternalRules(path=file_for_regex_transformer(), delimiter=',')
		)

		pipeline = Pipeline(stages=[document_assembler, regex_matcher])

		# Each row holds one field: the list of matched strings for that document.
		matches = (
			pipeline.fit(rdd_df)
			.transform(rdd_df)
			.select("regex_matches.result")
			.rdd.flatMap(lambda row: row[0])
		)
		return matches.map(_format_price)