-
-
Save yogenderPalChandra/8fbdda8fb1ade0307717250f6759bc5c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def processRecord_udf_param2(file):
    """Collect the text of every list item under ``<ul class="params2">``.

    ``file`` is handed to BeautifulSoup with the ``html.parser`` backend.
    For each ``<ul>`` carrying the ``params2`` class, every ``<li>`` found
    beneath it contributes one entry to the result.

    Returns a list of strings, in the order the items are found; the list
    is empty when no matching ``<ul>`` exists.
    """
    parsed = BeautifulSoup(file, "html.parser")
    return [
        # get_text(strip=True) trims the text; any non-breaking space
        # (U+00A0) still left is turned into an ordinary space.
        item.get_text(strip=True).replace(u'\xa0', u' ')
        for param_list in parsed.find_all('ul', class_='params2')
        for item in param_list.find_all('li')
    ]
def register_apply_udf_param2(rdd_l):
    """Apply the params2 parser as a UDF and return the results as an RDD.

    ``rdd_l`` is *called* with ``path`` and its result is passed to
    ``remove_fileName_rdd(..., columnName='val')``. A ``cleaned`` column is
    then derived from ``val`` through ``processRecord_udf_param2``, and only
    that column is kept. The return value is the resulting rows' RDD,
    flattened with ``flatMap`` so each element is a cell value rather than
    a row.

    NOTE(review): ``path`` is not a parameter of this function; it is
    resolved from the enclosing/module scope at call time -- confirm it is
    defined before this is invoked.
    NOTE(review): the UDF is declared with ``StringType()`` although
    ``processRecord_udf_param2`` returns a list -- confirm this is intended.
    """
    # Build the input frame first so evaluation order matches the
    # single-expression form this replaces.
    source_df = remove_fileName_rdd(rdd_l(path), columnName='val')
    parse_params2 = udf(processRecord_udf_param2, StringType())
    cleaned_df = source_df.withColumn('cleaned', parse_params2('val'))
    return cleaned_df.select("cleaned").rdd.flatMap(lambda x: x)
def cleaned2pd_param2(cleaned_rdd_l):
    """Turn an RDD of cleaned values into a one-column pandas DataFrame.

    Each element of ``cleaned_rdd_l`` is wrapped in a ``Row`` whose single
    field is named ``cleanedParams2``. The rows are converted to a Spark
    DataFrame, that column is selected, and the result is returned via
    ``toPandas()``.

    Side effect: sets the pandas option ``display.max_colwidth`` to
    ``None``. This goes through ``pd.set_option``, so it is not limited to
    the returned frame.
    """
    pd.set_option('display.max_colwidth', None)
    column_name = "cleanedParams2"
    make_row = Row(column_name)
    spark_df = cleaned_rdd_l.map(make_row).toDF()
    return spark_df.select(column_name).toPandas()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment