Skip to content

Instantly share code, notes, and snippets.

View pavlov99's full-sized avatar
:shipit:
Coding mode

Kirill Pavlov pavlov99

:shipit:
Coding mode
View GitHub Profile
select * from lookup_example_nhl_ext limit 5;
OK
Boston Bruins Atlantic Eastern Conference
Buffalo Sabres Atlantic Eastern Conference
Detroit Red Wings Atlantic Eastern Conference
Florida Panthers Atlantic Eastern Conference
Montreal Canadiens Atlantic Eastern Conference
Time taken: 0.1 seconds, Fetched: 5 row(s)
sqlContext.table("lookup_example_nhl_ext").limit(2).show()
+-------------+--------+------------------+
| team|division| conference|
+-------------+--------+------------------+
| Team|Division| Conference|
|Boston Bruins|Atlantic|Eastern Conference|
+-------------+--------+------------------+
CREATE VIEW IF NOT EXISTS lookup_example_nhl
AS
SELECT *
FROM lookup_example_nhl_ext
WHERE team != 'Team';
val teams = sqlContext.table("lookup_example_nhl")
.withColumn("short_name",
when(
locate("New York", $"team") === 1,
regexp_extract($"team", "\\w+$", 0)
).when(
(locate("Devils", $"team") > 0) ||
(locate("Kings", $"team") > 0) ||
(locate("Sharks", $"team") > 0) ||
(locate("Blues", $"team") > 0),
import org.apache.spark.sql.Window
val competitorWindow = Window
.partitionBy("date", "competitor")
.orderBy(levenshtein($"competitor", $"short_name"))
val scheduleRich = schedule
.join(
teams, levenshtein($"competitor", $"short_name") < 5, "left_outer"
)
scheduleRich.groupBy("division").count().orderBy($"count".desc).show()
+------------+-----+
| division|count|
+------------+-----+
| Pacific| 36|
| Central| 20|
| Atlantic| 16|
|Metropolitan| 16|
+------------+-----+
@pavlov99
pavlov99 / apache-spark-boolean-operations.scala
Last active October 3, 2016 02:55
Apache Spark: boolean operations with null handling
sc.parallelize(Array[(Int, Option[Boolean])](
(0, Some(true)), (1, Some(false)), (3, null)
)).toDF("id", "column")
.withColumn("notColumn", !$"column")
.withColumn("andNull", $"column" && null)
.withColumn("orNull", $"column" || null)
.withColumn("andFalse", $"column" && false)
.withColumn("orFalse", $"column" || false)
.withColumn("andTrue", $"column" && true)
.withColumn("orTrue", $"column" || true)
hive -e 'set hive.cli.print.header=true; select * from table_name' | tr "\t" "," > ~/table_name.csv
df.select("columnName").collect().map(_.getString(0)).sorted
@pavlov99
pavlov99 / group-overlap-detection.scala
Last active January 4, 2017 05:55
Check groups overlap
// Data example:
// id group
// 1 A
// 2 A
// 2 B
// In this case object `2` belongs to both groups "A" and "B"
val overlappedGroups = groups.select($"id", $"group" as "_group")
groups
.join(overlappedGroups, (groups("id") === overlappedGroups("id")) && ($"group" < $"_group")) // NOTE: group A < group B, so duplicates (A,B) (B,A) would be removed.