Skip to content

Instantly share code, notes, and snippets.

View lakshay-arora's full-sized avatar
🇮🇳

Lakshay lakshay-arora

🇮🇳
  • Walmart
  • Bengaluru
View GitHub Profile
# Transformation 1: derive rdd_1 from rdd_0 by adding 4 to every element
rdd_1 = rdd_0.map(lambda value: value + 4)
# Show the resulting RDD object
print(rdd_1)
# Show the lineage recorded for rdd_1
print(rdd_1.toDebugString())
# Transformation 2: derive rdd_2 from rdd_1 by adding a further 20 to every element
rdd_2 = rdd_1.map(lambda value: value + 20)
# Show the resulting RDD object
print(rdd_2)
# Show the lineage recorded for rdd_2 (it is built on top of rdd_1)
print(rdd_2.toDebugString())
# Load the text file as an RDD, passing minPartitions=4
my_text_file = sc.textFile('tokens_spark.txt', minPartitions=4)
# Show the freshly loaded RDD object
print(my_text_file)
# Chain two per-element transformations:
#   1. convert each element to lower case
#   2. keep only the first two characters of each element
my_text_file = (
    my_text_file
    .map(lambda text: text.lower())
    .map(lambda text: text[:2])
)
# Show the RDD object after both transformations
print(my_text_file)
# Show the lineage of the transformed RDD
print(my_text_file.toDebugString())
# Print the approximate number of distinct elements after all the transformations
print(my_text_file.countApproxDistinct())
from pyspark.mllib.linalg import Vectors
## Dense Vector: every entry is listed explicitly, including the trailing zero
print(Vectors.dense(1, 2, 3, 4, 5, 6, 0))
# >> DenseVector([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.0])
### SPARSE VECTOR
### Vectors.sparse(length, indices_of_non_zero_values, non_zero_values)
### Index values must be strictly increasing
from pyspark.mllib.regression import LabeledPoint
# Attach the label 1 to a dense feature vector
point_1 = LabeledPoint(label=1, features=Vectors.dense(1, 2, 3, 4, 5))
# Feature vector stored on the labeled point
print(point_1.features)
# Label stored on the labeled point
print(point_1.label)
# Attach the label 1 to a sparse feature vector of length 10,
# given as an {index: value} mapping of its non-zero entries
point_2 = LabeledPoint(label=1, features=Vectors.sparse(10, {0: 2, 1: 4, 6: 5}))
# Feature vector of the sparse labeled point
print(point_2.features)
# >> SparseVector(10, {0: 2.0, 1: 4.0, 6: 5.0})
# Label of the sparse labeled point
print(point_2.label)
# >> 1.0
# Distributed Data Type - Row Matrix
from pyspark.mllib.linalg.distributed import RowMatrix
# Build an RDD whose elements are the four rows (three values each) of the matrix
rows = sc.parallelize([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
    [10, 11, 12],
])
# Wrap the RDD of rows in a distributed RowMatrix
row_matrix = RowMatrix(rows)
# Indexed Row Matrix
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
# create RDD
indexed_rows = sc.parallelize([
IndexedRow(0, [0,1,2]),
IndexedRow(1, [1,2,3]),
IndexedRow(2, [3,4,5]),
IndexedRow(3, [4,2,3]),