# Source: yogenderPalChandra/param1.py (secret gist)

## param1.py

import pandas as pd
from bs4 import BeautifulSoup
from pyspark.sql.types import *
from pyspark.sql.functions import udf

def remove_fileName_rdd(rdd_l, columnName=None):
    """Keep item ``[1]`` of every record and return a one-column DataFrame.

    Each record of ``rdd_l`` is indexed with ``[1]`` (presumably the content
    half of a ``(file name, content)`` pair -- confirm against the caller),
    wrapped in a ``Row`` whose single field is named ``str(columnName)``,
    and the resulting RDD is converted with ``toDF()``.

    Because of the ``str()`` conversion, leaving ``columnName`` at its
    default of ``None`` produces a column literally named ``"None"``.
    """
    from pyspark.sql import Row

    make_row = Row(str(columnName))  # Row factory with one named field
    second_items = rdd_l.map(lambda record: record[1])
    return second_items.map(make_row).toDF()


def processRecord_udf_param1(rdd_l):
    """Collect the text of every ``<li>`` found under ``<ul class="params1">``.

    ``rdd_l`` is handed straight to ``BeautifulSoup`` with the
    ``"html.parser"`` backend, so despite its name it is the markup to
    parse rather than an RDD.

    Returns a list with one string per matched ``<li>``: its text as
    produced by ``get_text(strip=True)``, with non-breaking spaces
    (U+00A0) replaced by ordinary spaces. The list is empty when nothing
    matches.
    """
    soup = BeautifulSoup(rdd_l, "html.parser")
    return [
        item.get_text(strip=True).replace(u'\xa0', u' ')
        for params_list in soup.find_all('ul', class_='params1')
        for item in params_list.find_all('li')
    ]

def register_apply_udf(rdd_l):
    """Apply the ``params1`` parser to the loaded data as a Spark UDF.

    ``rdd_l`` is *called* with ``path``; the result goes through
    ``remove_fileName_rdd`` to obtain a DataFrame with a ``val`` column.
    A ``cleaned`` column is then added by running
    ``processRecord_udf_param1`` over ``val`` as a UDF declared with
    ``StringType()``. Only ``cleaned`` is kept, and each one-field row is
    unpacked by ``flatMap`` so the returned RDD holds the raw values.

    NOTE(review): ``path`` is neither a parameter nor defined in the
    visible source -- presumably a global expected to exist at call time;
    confirm, otherwise this raises ``NameError``.
    NOTE(review): the UDF is declared ``StringType()`` although the parser
    returns a list -- confirm this is the intended return type.
    """
    def _parse(markup):
        return processRecord_udf_param1(markup)

    frame = remove_fileName_rdd(rdd_l(path), columnName='val')
    parse_udf = udf(_parse, StringType())
    cleaned = frame.withColumn('cleaned', parse_udf('val')).select("cleaned")
    return cleaned.rdd.flatMap(lambda record: record)

def cleaned2pd_param1(cleaned_rdd_l):
    """Convert the parsed ``params1`` RDD into a one-column pandas DataFrame.

    Every element of ``cleaned_rdd_l`` is wrapped in a single-field ``Row``
    named ``cleanedParams1``; the RDD is turned into a Spark DataFrame with
    ``toDF()`` and then collected to pandas with ``toPandas()``.

    Side effect: sets the pandas option ``display.max_colwidth`` to
    ``None`` for the whole process, so long cell values are not truncated
    when the frame is displayed.
    """
    # Imported locally so that ``Row`` is guaranteed to be in scope here;
    # the function previously referenced it without any import reaching it.
    from pyspark.sql import Row

    pd.set_option('display.max_colwidth', None)
    row = Row("cleanedParams1")
    return cleaned_rdd_l.map(row).toDF().select('cleanedParams1').toPandas()

	import pandas as pd
	from bs4 import BeautifulSoup
	from pyspark.sql.types import *
	from pyspark.sql.functions import udf

	def remove_fileName_rdd(rdd_l, columnName=None):
	    """Keep item ``[1]`` of every record and return a one-column DataFrame.

	    Each record of ``rdd_l`` is indexed with ``[1]`` (presumably the
	    content half of a ``(file name, content)`` pair -- confirm against
	    the caller), wrapped in a ``Row`` whose single field is named
	    ``str(columnName)``, and the resulting RDD is converted with
	    ``toDF()``.

	    Because of the ``str()`` conversion, leaving ``columnName`` at its
	    default of ``None`` produces a column literally named ``"None"``.
	    """
	    from pyspark.sql import Row

	    columnName = str(columnName)
	    row = Row(columnName)  # Row factory with one named field
	    return rdd_l.map(lambda x: x[1]).map(row).toDF()



	def processRecord_udf_param1(rdd_l):
	    """Collect the text of every ``<li>`` found under ``<ul class="params1">``.

	    ``rdd_l`` is handed straight to ``BeautifulSoup`` with the
	    ``"html.parser"`` backend, so despite its name it is the markup to
	    parse rather than an RDD.

	    Returns a list with one string per matched ``<li>``: its text as
	    produced by ``get_text(strip=True)``, with non-breaking spaces
	    (U+00A0) replaced by ordinary spaces.
	    """
	    soup = BeautifulSoup(rdd_l, "html.parser")
	    classes = []
	    for element in soup.find_all('ul', class_='params1'):
	        for il in element.find_all('li'):
	            text = il.get_text(strip=True).replace(u'\xa0', u' ')
	            classes.append(text)

	    return classes

	def register_apply_udf(rdd_l):
	    """Apply the ``params1`` parser to the loaded data as a Spark UDF.

	    ``rdd_l`` is *called* with ``path``; the result goes through
	    ``remove_fileName_rdd`` to obtain a DataFrame with a ``val`` column.
	    A ``cleaned`` column is added by running ``processRecord_udf_param1``
	    over ``val`` as a UDF declared with ``StringType()``. Only
	    ``cleaned`` is kept, and each one-field row is unpacked by
	    ``flatMap`` so the returned RDD holds the raw values.

	    NOTE(review): ``path`` is neither a parameter nor defined in the
	    visible source -- presumably a global expected to exist at call
	    time; confirm, otherwise this raises ``NameError``.
	    NOTE(review): the UDF is declared ``StringType()`` although the
	    parser returns a list -- confirm this is the intended return type.
	    """
	    apply2lambdafunc = lambda z: processRecord_udf_param1(z)

	    return (
	        remove_fileName_rdd(rdd_l(path), columnName='val')
	        .withColumn('cleaned', udf(apply2lambdafunc, StringType())('val'))
	        .select("cleaned")
	        .rdd.flatMap(lambda x: x)
	    )

	def cleaned2pd_param1(cleaned_rdd_l):
	    """Convert the parsed ``params1`` RDD into a one-column pandas DataFrame.

	    Every element of ``cleaned_rdd_l`` is wrapped in a single-field
	    ``Row`` named ``cleanedParams1``; the RDD is turned into a Spark
	    DataFrame with ``toDF()`` and then collected with ``toPandas()``.

	    Side effect: sets the pandas option ``display.max_colwidth`` to
	    ``None`` for the whole process, so long cell values are not
	    truncated when the frame is displayed.
	    """
	    # Imported locally so that ``Row`` is guaranteed to be in scope here;
	    # the function previously referenced it without any import reaching it.
	    from pyspark.sql import Row

	    pd.set_option('display.max_colwidth', None)
	    row = Row("cleanedParams1")
	    return cleaned_rdd_l.map(row).toDF().select('cleanedParams1').toPandas()