Skip to content

Instantly share code, notes, and snippets.

View pavlov99's full-sized avatar
:shipit:
Coding mode

Kirill Pavlov pavlov99

:shipit:
Coding mode
View GitHub Profile
def combine[T](s: Seq[T]): Seq[Seq[T]] =
for {
len <- 1 to s.length
combinations <- s combinations len
} yield combinations
println(combine(List('a', 'b', 'c')))
@pavlov99
pavlov99 / 1-schedule.scala
Last active September 17, 2016 08:27
lookup-table-maintenance-in-hive blog files
val schedule = sqlContext.read
.format("com.databricks.spark.csv")
.option("header", "true")
.option("inferSchema", "true")
.load("lookup-example/san-jose-schedule-2016-2017.csv")
.select(
to_date(
unix_timestamp($"START_DATE", "MM/dd/yyyy").cast("timestamp")
) as "date",
when(
@pavlov99
pavlov99 / blog-lookup-table-1-schedule.scala
Last active September 17, 2016 08:38
Medium blog: lookup-table-maintenance-in-hive snippets
val schedule = sqlContext.read
.format("com.databricks.spark.csv")
.option("header", "true")
.option("inferSchema", "true")
.load("lookup-example/san-jose-schedule-2016-2017.csv")
.select(
to_date(
unix_timestamp($"START_DATE", "MM/dd/yyyy").cast("timestamp")
) as "date",
when(
CREATE EXTERNAL TABLE IF NOT EXISTS lookup_example_nhl_ext(
team String,
division String,
conference String)
COMMENT 'NHL teams'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION 'hdfs:///user/<user>/lookup-example/nhl-lookup'
select * from lookup_example_nhl_ext limit 5;
OK
Boston Bruins Atlantic Eastern Conference
Buffalo Sabres Atlantic Eastern Conference
Detroit Red Wings Atlantic Eastern Conference
Florida Panthers Atlantic Eastern Conference
Montreal Canadiens Atlantic Eastern Conference
Time taken: 0.1 seconds, Fetched: 5 row(s)
sqlContext.table("lookup_example_nhl_ext").limit(2).show()
+-------------+--------+------------------+
| team|division| conference|
+-------------+--------+------------------+
| Team|Division| Conference|
|Boston Bruins|Atlantic|Eastern Conference|
+-------------+--------+------------------+
CREATE VIEW IF NOT EXISTS lookup_example_nhl
AS
SELECT *
FROM lookup_example_nhl_ext
WHERE team != 'Team';
val teams = sqlContext.table("lookup_example_nhl")
.withColumn("short_name",
when(
locate("New York", $"team") === 1,
regexp_extract($"team", "\\w+$", 0)
).when(
(locate("Devils", $"team") > 0) ||
(locate("Kings", $"team") > 0) ||
(locate("Sharks", $"team") > 0) ||
(locate("Blues", $"team") > 0),
import org.apache.spark.sql.Window
val competitorWindow = Window
.partitionBy("date", "competitor")
.orderBy(levenshtein($"competitor", $"short_name"))
val scheduleRich = schedule
.join(
teams, levenshtein($"competitor", $"short_name") < 5, "left_outer"
)
scheduleRich.groupBy("division").count().orderBy($"count".desc).show()
+------------+-----+
| division|count|
+------------+-----+
| Pacific| 36|
| Central| 20|
| Atlantic| 16|
|Metropolitan| 16|
+------------+-----+