-
-
Save yogenderPalChandra/7c77d30f4afaa91c2b28930e7f92fade to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def rdd_2_address(rdd_df):
    """Extract one address string per input record and return a pandas DataFrame.

    Each record of ``rdd_df`` is indexed as ``record[1]`` to obtain the raw
    text.  According to the original author's notes the RDD comes from
    ``wholeTextFiles`` and therefore holds ``(file_name, file_content)``
    tuples, and segment 6 of the content is the
    ``class="location-text ng-binding"`` element -- confirm against the
    caller and the scraped pages if either changes.

    Per-record extraction steps:

    1. take the text at ``record[1]``;
    2. split it on ``'\\n\\t'`` and keep segment index 6;
    3. split that segment on ``'prodeji'`` and keep the part right after
       the first occurrence;
    4. split on ``';'`` and keep the part before the first ``';'``;
    5. strip surrounding space characters (only ``' '``, not other
       whitespace).

    :param rdd_df: RDD whose records expose the raw text at index 1.
    :return: pandas DataFrame with a single nullable string column
        ``address``, one row per input record.
    :raises IndexError: raised inside the Spark job (and reported by Spark
        when the job is evaluated) if a record has fewer than seven
        ``'\\n\\t'`` segments or segment 6 does not contain ``'prodeji'``.
    """

    # Kept as a nested function on purpose: like the lambdas it replaces, a
    # local function is shipped to the executors by value, so no extra module
    # has to be importable on the workers.
    def extract_address(record):
        segment = record[1].split('\n\t')[6]
        after_marker = segment.split('prodeji')[1]
        # The address ends at the first ';'.  No str() conversion is needed:
        # the value is already a string.
        return after_marker.split(';')[0].strip(' ')

    address_rows = rdd_df.map(extract_address).map(lambda address: Row(address))

    # Single nullable string column named "address".
    schema = StructType([StructField("address", StringType(), True)])
    address_df = sqlContext.createDataFrame(address_rows, schema)
    return address_df.toPandas()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment