# Lakshay lakshay-arora

## lazy_2.py
# Derive rdd_1 from rdd_0 by adding 4 to every element.
rdd_1 = rdd_0.map(lambda number: number + 4)

# Show the RDD object.
print(rdd_1)

# Show the lineage of rdd_1 as reported by toDebugString().
print(rdd_1.toDebugString())

## lazy_3.py
# Derive rdd_2 from rdd_1 by adding 20 to every element.
rdd_2 = rdd_1.map(lambda number: number + 20)

# Show the RDD object.
print(rdd_2)

# Show the lineage of rdd_2 as reported by toDebugString().
print(rdd_2.toDebugString())

## lazy_2_1.py
# Read 'tokens_spark.txt' into an RDD, requesting a minimum of 4 partitions.
my_text_file = sc.textFile('tokens_spark.txt', minPartitions=4)

# Show the RDD object.
print(my_text_file)

# Replace the RDD with one whose elements are lower-cased.
my_text_file = my_text_file.map(lambda text: text.lower())

# Updated RDD Object
# NOTE(review): nothing follows this heading in this snippet -- the statement
# that displayed the updated RDD appears to be missing; confirm against the
# original tutorial.

## lazy_2_2.py
# Keep only the first two characters of every element.
my_text_file = my_text_file.map(lambda text: text[:2])

# Show the RDD object.
print(my_text_file)

# Show the RDD lineage as reported by toDebugString().
print(my_text_file.toDebugString())

# Get the first element after all the transformations
# NOTE(review): no statement accompanies this heading in this snippet --
# confirm whether a call is missing here.

## lazy_2_3.py
# Print the value returned by countApproxDistinct() on my_text_file
# (going by the method name, an approximate count of its distinct elements).
print(my_text_file.countApproxDistinct())

## local_vectors.py
from pyspark.mllib.linalg import Vectors

## Dense vector: every entry is listed explicitly, zeros included.
print(Vectors.dense([1, 2, 3, 4, 5, 6, 0]))
# >> DenseVector([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.0])

### Sparse vector
### Signature: Vectors.sparse(length, index_of_non_zero_values, non_zero_values)
### The index values must be strictly increasing.

## labeled_point.py
from pyspark.mllib.regression import LabeledPoint

# Build a labeled point: label 1 attached to a dense feature vector.
point_1 = LabeledPoint(1, Vectors.dense([1, 2, 3, 4, 5]))

# Show the feature vector.
print(point_1.features)

# Show the label.
print(point_1.label)

## labeled_point_sparse.py
# Build a labeled point: label 1 attached to a sparse vector of length 10
# with values 2, 4 and 5 at indices 0, 1 and 6.
point_2 = LabeledPoint(1, Vectors.sparse(10, [0, 1, 6], [2, 4, 5]))

# Show the feature vector.
print(point_2.features)
# >> SparseVector(10, {0: 2.0, 1: 4.0, 6: 5.0})

# Show the label.
print(point_2.label)
# >> 1.0

## row_matrix.py
# Distributed Data Type - Row Matrix
from pyspark.mllib.linalg.distributed import RowMatrix

# Build an RDD in which each element is one row of the matrix.
rows = sc.parallelize([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
    [10, 11, 12],
])

# Wrap the RDD of rows in a distributed RowMatrix.
row_matrix = RowMatrix(rows)


## indexed_row_matrix.py
# Indexed Row Matrix

from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

# create RDD
indexed_rows = sc.parallelize([
    IndexedRow(0, [0,1,2]),
    IndexedRow(1, [1,2,3]),
    IndexedRow(2, [3,4,5]),
    IndexedRow(3, [4,2,3]),
	# Derive rdd_1 from rdd_0 by adding 4 to every element.
	rdd_1 = rdd_0.map(lambda number: number + 4)

	# Show the RDD object.
	print(rdd_1)

	# Show the lineage of rdd_1 as reported by toDebugString().
	print(rdd_1.toDebugString())
	# Derive rdd_2 from rdd_1 by adding 20 to every element.
	rdd_2 = rdd_1.map(lambda number: number + 20)

	# Show the RDD object.
	print(rdd_2)

	# Show the lineage of rdd_2 as reported by toDebugString().
	print(rdd_2.toDebugString())
	# Read 'tokens_spark.txt' into an RDD, requesting a minimum of 4 partitions.
	my_text_file = sc.textFile('tokens_spark.txt', minPartitions=4)

	# Show the RDD object.
	print(my_text_file)

	# Replace the RDD with one whose elements are lower-cased.
	my_text_file = my_text_file.map(lambda text: text.lower())

	# Updated RDD Object
	# NOTE(review): nothing follows this heading in this snippet -- the statement
	# that displayed the updated RDD appears to be missing; confirm against the
	# original tutorial.
	# Keep only the first two characters of every element.
	my_text_file = my_text_file.map(lambda text: text[:2])

	# Show the RDD object.
	print(my_text_file)

	# Show the RDD lineage as reported by toDebugString().
	print(my_text_file.toDebugString())

	# Get the first element after all the transformations
	# NOTE(review): no statement accompanies this heading in this snippet --
	# confirm whether a call is missing here.
	from pyspark.mllib.linalg import Vectors

	## Dense vector: every entry is listed explicitly, zeros included.
	print(Vectors.dense([1, 2, 3, 4, 5, 6, 0]))
	# >> DenseVector([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.0])

	### Sparse vector
	### Signature: Vectors.sparse(length, index_of_non_zero_values, non_zero_values)
	### The index values must be strictly increasing.
	from pyspark.mllib.regression import LabeledPoint

	# Build a labeled point: label 1 attached to a dense feature vector.
	point_1 = LabeledPoint(1, Vectors.dense([1, 2, 3, 4, 5]))

	# Show the feature vector.
	print(point_1.features)

	# Show the label.
	print(point_1.label)
	# Build a labeled point: label 1 attached to a sparse vector of length 10
	# with values 2, 4 and 5 at indices 0, 1 and 6.
	point_2 = LabeledPoint(1, Vectors.sparse(10, [0, 1, 6], [2, 4, 5]))

	# Show the feature vector.
	print(point_2.features)
	# >> SparseVector(10, {0: 2.0, 1: 4.0, 6: 5.0})

	# Show the label.
	print(point_2.label)
	# >> 1.0
	# Distributed Data Type - Row Matrix
	from pyspark.mllib.linalg.distributed import RowMatrix

	# Build an RDD in which each element is one row of the matrix.
	rows = sc.parallelize([
		[1, 2, 3],
		[4, 5, 6],
		[7, 8, 9],
		[10, 11, 12],
	])

	# Wrap the RDD of rows in a distributed RowMatrix.
	row_matrix = RowMatrix(rows)
	# Indexed Row Matrix

	from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

	# create RDD
	indexed_rows = sc.parallelize([
	IndexedRow(0, [0,1,2]),
	IndexedRow(1, [1,2,3]),
	IndexedRow(2, [3,4,5]),
	IndexedRow(3, [4,2,3]),