Adrian Chang adrian-chang

## partition_data.txt
[user@server ~]$ hdfs dfs -du -h ..
121.7 M  ../year=2017/processed/part-00000-4e56be16-aa32-43fa-ad2a-3668b1240f4e-c000.snappy.parquet
100.2 M  ../year=2017/processed/part-00001-4e56be16-aa32-43fa-ad2a-3668b1240f4e-c000.snappy.parquet
123.6 M  ../year=2017/processed/part-00002-4e56be16-aa32-43fa-ad2a-3668b1240f4e-c000.snappy.parquet
....

## apacheSparkRepartitioningData.scala
// default data with 6 partitions
val colleges = Seq(
    (1, "Harvard"),
    (1, "Stanford"),
    (2, "MIT"),
    (2, "UC Berkeley"),
    (3, "University of Texas"),
    (3, "Columbia"),
    (4, "University of Washington"),
    (4, "Georgia Tech")

## apacheSparkRepartitioningCoalesce.scala
// Coalesce down to 2 partitions, pair every row with the index of the
// partition that holds it, and print the resulting (partitionIndex, row) pairs.
// The collected pairs are bound to `collegeCoalesce` and printed in a separate
// statement: chaining `.foreach(println(_))` onto the assignment would leave
// the val holding `Unit` instead of the data its name describes.
val collegeCoalesce = colleges.coalesce(2).rdd.mapPartitionsWithIndex { (index, rows) =>
  rows.map(college => (index, college))
}.collect()
collegeCoalesce.foreach(println)

/*
(0,[4,University of Washington])
(0,[1,Harvard])
(0,[4,Georgia Tech])
(0,[1,Stanford])

## apacheSparkRepartitioningRepartition.scala
// Repartition down to 2 partitions, pair every row with the index of the
// partition that holds it, and print the resulting (partitionIndex, row) pairs.
// The collected pairs are bound to `collegeRepartition` and printed in a
// separate statement: chaining `.foreach(println(_))` onto the assignment would
// leave the val holding `Unit` instead of the data its name describes.
val collegeRepartition = colleges.repartition(2).rdd.mapPartitionsWithIndex { (index, rows) =>
  rows.map(college => (index, college))
}.collect()
collegeRepartition.foreach(println)

/*
(0,[1,Harvard])
(0,[2,MIT])
(0,[3,University of Texas])
(0,[4,University of Washington])

## student_majors_cross_join.txt
// Cross join: pairs every student row with every major row, regardless of
// student_id, so the result carries a student_id column from each side.
students.crossJoin(majors).show()

+----------+------------+----------+----------------+
|student_id|student_name|student_id|           major|
+----------+------------+----------+----------------+
|         1|        John|         2|Computer Science|
|         1|        John|         3|         History|
|         2|        Bill|         2|Computer Science|
|         2|        Bill|         3|         History|
|         3|        Mary|         2|Computer Science|

## student_majors_inner_join.txt
 // Inner join on the shared "student_id" column: only students that have a
 // matching row in majors are kept, and student_id appears once in the result.
 students.join(majors, Seq("student_id"), "inner").show()

+----------+------------+----------------+
|student_id|student_name|           major|
+----------+------------+----------------+
|         2|        Bill|Computer Science|
|         3|        Mary|         History|
+----------+------------+----------------+

## majors.txt
// DataFrame of (student_id, major) rows, built from an in-memory sequence.
val majors = {
  val majorRows = Seq(
    2 -> "Computer Science",
    3 -> "History"
  )
  majorRows.toDF("student_id", "major")
}

majors: org.apache.spark.sql.DataFrame = [student_id: int, major: string]

// Print the majors DataFrame as a text table.
majors.show()

+----------+----------------+

## colleges.txt
// DataFrame of (student_id, college_name) rows; a student_id may appear on
// more than one row, one per college.
val colleges = {
  val collegeRows = Seq(
    1 -> "Harvard",
    1 -> "Stanford",
    3 -> "University of Texas",
    3 -> "Columbia",
    4 -> "University of Washington",
    4 -> "Georgia Tech"
  )
  collegeRows.toDF("student_id", "college_name")
}

colleges: org.apache.spark.sql.DataFrame = [student_id: int, college_name: string]

## students.txt
// DataFrame of (student_id, student_name) rows, built from an in-memory sequence.
val students = {
	val studentRows = Seq(
		1 -> "John",
		2 -> "Bill",
		3 -> "Mary",
		4 -> "Jane"
	)
	studentRows.toDF("student_id", "student_name")
}

students: org.apache.spark.sql.DataFrame = [student_id: int, student_name: string]

// Print the students DataFrame as a text table.
students.show()

## plan_checkpoint.txt
== Parsed Logical Plan ==
Relation[domain#23250,twenty_four_months#23251
	[user@server ~]$ hdfs dfs -du -h ..
	121.7 M ../year=2017/processed/part-00000-4e56be16-aa32-43fa-ad2a-3668b1240f4e-c000.snappy.parquet
	100.2 M ../year=2017/processed/part-00001-4e56be16-aa32-43fa-ad2a-3668b1240f4e-c000.snappy.parquet
	123.6 M ../year=2017/processed/part-00002-4e56be16-aa32-43fa-ad2a-3668b1240f4e-c000.snappy.parquet
	....
	// default data with 6 partitions
	val colleges = Seq(
	(1, "Harvard"),
	(1, "Stanford"),
	(2, "MIT"),
	(2, "UC Berkeley"),
	(3, "University of Texas"),
	(3, "Columbia"),
	(4, "University of Washington"),
	(4, "Georgia Tech")
	// coalesce down to 2 partitions
	val collegeCoalesce = colleges.coalesce(2).rdd.mapPartitionsWithIndex((index, iterator) => {
	iterator.map(college => (index, college))
	}).collect().foreach(println(_))

	/*
	(0,[4,University of Washington])
	(0,[1,Harvard])
	(0,[4,Georgia Tech])
	(0,[1,Stanford])
	// repartition down to 2 partitions
	val collegeRepartition = colleges.repartition(2).rdd.mapPartitionsWithIndex((index, iterator) => {
	iterator.map(college => (index, college))
	}).collect().foreach(println(_))

	/*
	(0,[1,Harvard])
	(0,[2,MIT])
	(0,[3,University of Texas])
	(0,[4,University of Washington])
	students.crossJoin(majors).show()

	+----------+------------+----------+----------------+
	\|student_id\|student_name\|student_id\| major\|
	+----------+------------+----------+----------------+
	\| 1\| John\| 2\|Computer Science\|
	\| 1\| John\| 3\| History\|
	\| 2\| Bill\| 2\|Computer Science\|
	\| 2\| Bill\| 3\| History\|
	\| 3\| Mary\| 2\|Computer Science\|
	students.join(majors, Seq("student_id"), "inner").show()

	+----------+------------+----------------+
	\|student_id\|student_name\| major\|
	+----------+------------+----------------+
	\| 2\| Bill\|Computer Science\|
	\| 3\| Mary\| History\|
	+----------+------------+----------------+
	val majors = Seq(
	(2, "Computer Science"),
	(3, "History")
	).toDF("student_id", "major")

	majors: org.apache.spark.sql.DataFrame = [student_id: int, major: string]

	majors.show()

	+----------+----------------+
	val students = Seq(
	(1, "John"),
	(2, "Bill"),
	(3, "Mary"),
	(4, "Jane")
	).toDF("student_id", "student_name")

	students: org.apache.spark.sql.DataFrame = [student_id: int, student_name: string]

	students.show()
	== Parsed Logical Plan ==
	Relation[domain#23250,twenty_four_months#23251