Skip to content

Instantly share code, notes, and snippets.

@yogenderPalChandra
Created July 3, 2022 09:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yogenderPalChandra/8fbdda8fb1ade0307717250f6759bc5c to your computer and use it in GitHub Desktop.
Save yogenderPalChandra/8fbdda8fb1ade0307717250f6759bc5c to your computer and use it in GitHub Desktop.
def processRecord_udf_param2(file):
    """Collect the text of every ``<li>`` found under ``<ul class="params2">``.

    Same approach as the other record parser, but targeting the
    ``params2`` list. ``file`` is the HTML markup handed to BeautifulSoup
    (parsed with the built-in ``html.parser``).

    Returns a list of strings, one per ``<li>`` element, in document order.
    Each string is whitespace-stripped and has non-breaking spaces replaced
    by ordinary spaces. The list is empty when no matching ``<ul>`` exists.
    """
    parsed = BeautifulSoup(file, "html.parser")
    return [
        # get_text(strip=True) trims each text fragment; the replace() then
        # normalises any remaining non-breaking spaces to plain spaces.
        item.get_text(strip=True).replace(u'\xa0', u' ')
        for param_list in parsed.find_all('ul', class_='params2')
        for item in param_list.find_all('li')
    ]
def register_apply_udf_param2(rdd_l):
    """Run the ``params2`` parser over the ``val`` column and flatten the rows.

    ``rdd_l`` is *called* with ``path``, so it must be a callable rather
    than an RDD instance. Its result is passed to ``remove_fileName_rdd``
    with ``columnName='val'``; a ``cleaned`` column is then added by applying
    ``processRecord_udf_param2`` as a UDF, and only that column is kept.

    Returns the RDD obtained by flat-mapping each selected row onto its
    field values.

    NOTE(review): ``path`` is not a parameter or local of this function, so
    it is presumably a module-level global - confirm it is defined before
    this is called.
    NOTE(review): the UDF is declared as ``StringType()`` although
    ``processRecord_udf_param2`` returns a list; verify the resulting column
    values are what downstream code expects.
    """
    source_df = remove_fileName_rdd(rdd_l(path), columnName='val')
    parse_params2 = udf(
        lambda raw_html: processRecord_udf_param2(raw_html),
        StringType(),
    )
    with_cleaned = source_df.withColumn('cleaned', parse_params2('val'))
    # Each Row holds a single field, so flattening the Row yields that value.
    return with_cleaned.select("cleaned").rdd.flatMap(lambda record: record)
def cleaned2pd_param2(cleaned_rdd_l):
    """Convert an RDD of cleaned values into a one-column pandas DataFrame.

    Every element of ``cleaned_rdd_l`` is wrapped in a ``Row`` whose single
    field is named ``cleanedParams2``. The rows are turned into a Spark
    DataFrame, and that column is returned via ``toPandas()``.

    Side effect: sets the pandas option ``display.max_colwidth`` to ``None``
    for the whole process, not just for the returned frame.
    """
    pd.set_option('display.max_colwidth', None)
    as_row = Row("cleanedParams2")
    spark_df = cleaned_rdd_l.map(as_row).toDF()
    return spark_df.select('cleanedParams2').toPandas()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment