# yogenderPalChandra/method2.py Secret

## method2.py
def rdd_2_address(rdd_df):
    """Extract one address string per input record and return a pandas DataFrame.

    Each record of ``rdd_df`` is indexed as a ``(file_name, file_content)``
    pair, which is how an RDD read with ``wholeTextFiles`` stores its data.
    Only the content (index 1) is used.

    Extraction steps applied to every record's content:

    1. Split on "newline followed by tab" and keep the chunk at index 6
       (the seventh chunk), which is expected to hold the
       ``class="location-text ng-binding"`` element.
    2. Split that chunk on the literal ``prodeji`` and keep the piece at
       index 1, i.e. the text right after the first occurrence.
    3. Split on ``;`` and keep the leading piece, where the address ends.
    4. Strip surrounding space characters.

    :param rdd_df: RDD of ``(file_name, file_content)`` pairs.
    :return: pandas DataFrame with a single nullable string column
        ``address``, one row per input record.

    No validation is done: a record with fewer than seven chunks, or whose
    chunk does not contain ``prodeji``, makes the indexing raise
    ``IndexError`` when Spark evaluates the job.
    """

    def _to_address_row(record):
        # record[0] is the file name and is intentionally ignored.
        chunk = record[1].split('\n\t')[6]
        after_marker = chunk.split('prodeji')[1]
        address = after_marker.split(';')[0].strip(' ')
        # str() is kept from the original pipeline so the value type is
        # unchanged for callers on any interpreter version.
        return Row(str(address))

    address_rows = rdd_df.map(_to_address_row)

    # Single nullable string column; relies on the module-level sqlContext.
    schema = StructType([StructField("address", StringType(), True)])
    return sqlContext.createDataFrame(address_rows, schema).toPandas()