Skip to content

Instantly share code, notes, and snippets.

View pavlov99's full-sized avatar
:shipit:
Coding mode

Kirill Pavlov pavlov99

:shipit:
Coding mode
View GitHub Profile
// Adds a derived "short_name" column to the NHL lookup view.
// NOTE(review): this snippet is truncated by the gist preview — the result
// expression of the second `when(...)` (and any trailing `.otherwise`) is
// not visible here, so the chain below is syntactically incomplete.
val teams = sqlContext.table("lookup_example_nhl")
.withColumn("short_name",
when(
// Team name starts with "New York" (locate === 1 means match at position 1):
// extract the trailing word of the name (regex \w+$).
locate("New York", $"team") === 1,
regexp_extract($"team", "\\w+$", 0)
).when(
// Team name contains one of these distinctive words anywhere (locate > 0).
(locate("Devils", $"team") > 0) ||
(locate("Kings", $"team") > 0) ||
(locate("Sharks", $"team") > 0) ||
(locate("Blues", $"team") > 0),
-- View over the raw external table that filters out the header record
-- (the row whose team column holds the literal string 'Team').
CREATE VIEW IF NOT EXISTS lookup_example_nhl AS
SELECT *
  FROM lookup_example_nhl_ext
 WHERE team <> 'Team';
sqlContext.table("lookup_example_nhl_ext").limit(2).show()
+-------------+--------+------------------+
| team|division| conference|
+-------------+--------+------------------+
| Team|Division| Conference|
|Boston Bruins|Atlantic|Eastern Conference|
+-------------+--------+------------------+
select * from lookup_example_nhl_ext limit 5;
OK
Boston Bruins Atlantic Eastern Conference
Buffalo Sabres Atlantic Eastern Conference
Detroit Red Wings Atlantic Eastern Conference
Florida Panthers Atlantic Eastern Conference
Montreal Canadiens Atlantic Eastern Conference
Time taken: 0.1 seconds, Fetched: 5 row(s)
-- External Hive table over the raw NHL lookup file stored on HDFS.
-- EXTERNAL: dropping the table leaves the underlying HDFS files in place.
CREATE EXTERNAL TABLE IF NOT EXISTS lookup_example_nhl_ext(
team String,
division String,
conference String)
COMMENT 'NHL teams'
-- Plain comma-separated text, one record per line.
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
-- NOTE(review): '<user>' is a placeholder — substitute the real HDFS user dir.
LOCATION 'hdfs:///user/<user>/lookup-example/nhl-lookup'
@pavlov99
pavlov99 / blog-lookup-table-1-schedule.scala
Last active September 17, 2016 08:38
Medium blog: lookup-table-maintenance-in-hive snippets
// Loads the 2016-2017 San Jose schedule CSV via the spark-csv data source,
// treating the first line as a header and inferring column types.
// NOTE(review): truncated by the gist preview — the trailing `when(` has no
// visible arguments, so the select list below is incomplete.
val schedule = sqlContext.read
.format("com.databricks.spark.csv")
.option("header", "true")
.option("inferSchema", "true")
.load("lookup-example/san-jose-schedule-2016-2017.csv")
.select(
to_date(
// Parse the START_DATE string with pattern MM/dd/yyyy into epoch seconds,
// cast to timestamp so to_date can truncate it to a calendar date.
unix_timestamp($"START_DATE", "MM/dd/yyyy").cast("timestamp")
) as "date",
when(
@pavlov99
pavlov99 / 1-schedule.scala
Last active September 17, 2016 08:27
lookup-table-maintenance-in-hive blog files
// Loads the 2016-2017 San Jose schedule CSV via the spark-csv data source,
// treating the first line as a header and inferring column types.
// NOTE(review): truncated by the gist preview — the trailing `when(` has no
// visible arguments, so the select list below is incomplete.
val schedule = sqlContext.read
.format("com.databricks.spark.csv")
.option("header", "true")
.option("inferSchema", "true")
.load("lookup-example/san-jose-schedule-2016-2017.csv")
.select(
to_date(
// Parse the START_DATE string with pattern MM/dd/yyyy into epoch seconds,
// cast to timestamp so to_date can truncate it to a calendar date.
unix_timestamp($"START_DATE", "MM/dd/yyyy").cast("timestamp")
) as "date",
when(
/** Enumerates every non-empty combination of the elements of `s`,
  * ordered by increasing combination length. For an empty input the
  * result is empty.
  */
def combine[T](s: Seq[T]): Seq[Seq[T]] =
  (1 to s.length).flatMap(k => s.combinations(k))
println(combine(List('a', 'b', 'c')))
@pavlov99
pavlov99 / 0-apache-spark-presentation.md
Last active May 13, 2016 03:38
Apache Spark in data science presentation

This gist consists of Spark presentation examples.

@pavlov99
pavlov99 / gist:369492916e44ddb1de06
Created February 22, 2016 09:57
fix-spark-union.scala
df1.unionAll(df2.select(fd1.columns.map(df1(_)): _*))