Skip to content

Instantly share code, notes, and snippets.

@pavlov99
Last active September 17, 2016 08:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pavlov99/2dd7739c3d6de237b8b39b118332ea06 to your computer and use it in GitHub Desktop.
Save pavlov99/2dd7739c3d6de237b8b39b118332ea06 to your computer and use it in GitHub Desktop.
lookup-table-maintenance-in-hive blog files
val schedule = sqlContext.read
.format("com.databricks.spark.csv")
.option("header", "true")
.option("inferSchema", "true")
.load("lookup-example/san-jose-schedule-2016-2017.csv")
.select(
to_date(
unix_timestamp($"START_DATE", "MM/dd/yyyy").cast("timestamp")
) as "date",
when(
locate("San Jose", $"SUBJECT") === 1,
regexp_extract($"SUBJECT", "^San Jose at (.*)$", 1)
).otherwise(
regexp_extract($"SUBJECT", "^(.*) at San Jose$", 1)
) as "competitor"
)
CREATE EXTERNAL TABLE IF NOT EXISTS lookup_example_nhl_ext(
team String,
division String,
conference String)
COMMENT 'NHL teams'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION 'hdfs:///user/<user>/lookup-example/nhl-lookup'
TBLPROPERTIES ("skip.header.line.count"="1");
select * from lookup_example_nhl_ext limit 5;
OK
Boston Bruins Atlantic Eastern Conference
Buffalo Sabres Atlantic Eastern Conference
Detroit Red Wings Atlantic Eastern Conference
Florida Panthers Atlantic Eastern Conference
Montreal Canadiens Atlantic Eastern Conference
Time taken: 0.1 seconds, Fetched: 5 row(s)
sqlContext.table("lookup_example_nhl_ext").limit(2).show()
+-------------+--------+------------------+
| team|division| conference|
+-------------+--------+------------------+
| Team|Division| Conference|
|Boston Bruins|Atlantic|Eastern Conference|
+-------------+--------+------------------+
CREATE VIEW IF NOT EXISTS lookup_example_nhl
AS
SELECT *
FROM lookup_example_nhl_ext
WHERE team != 'Team';
val teams = sqlContext.table("lookup_example_nhl")
.withColumn("short_name",
when(
locate("New York", $"team") === 1,
regexp_extract($"team", "\\w+$", 0)
).when(
(locate("Devils", $"team") > 0) ||
(locate("Kings", $"team") > 0) ||
(locate("Sharks", $"team") > 0) ||
(locate("Blues", $"team") > 0),
regexp_extract($"team", "^(.*) \\w+", 1)
).otherwise(regexp_extract($"team", "^\\w+", 0))
)
teams.show()
+--------------------+------------+------------------+------------+
| team| division| conference| short_name|
+--------------------+------------+------------------+------------+
| Boston Bruins| Atlantic|Eastern Conference| Boston|
| Buffalo Sabres| Atlantic|Eastern Conference| Buffalo|
| Detroit Red Wings| Atlantic|Eastern Conference| Detroit|
| Florida Panthers| Atlantic|Eastern Conference| Florida|
| Montreal Canadiens| Atlantic|Eastern Conference| Montreal|
| Ottawa Senators| Atlantic|Eastern Conference| Ottawa|
| Tampa Bay Lightning| Atlantic|Eastern Conference| Tampa|
| Toronto Maple Leafs| Atlantic|Eastern Conference| Toronto|
| Carolina Hurricanes|Metropolitan|Eastern Conference| Carolina|
|Columbus Blue Jac...|Metropolitan|Eastern Conference| Columbus|
+--------------------+------------+------------------+------------+
import org.apache.spark.sql.Window
val competitorWindow = Window
.partitionBy("date", "competitor")
.orderBy(levenshtein($"competitor", $"short_name"))
val scheduleRich = schedule
.join(
teams, levenshtein($"competitor", $"short_name") < 5, "left_outer"
)
.withColumn("_rank", row_number().over(competitorWindow))
.filter($"_rank" === 1)
.drop("_rank")
scheduleRich.drop("short_name").show(2)
+----------+----------+---------------+--------+------------------+
| date|competitor| team|division| conference|
+----------+----------+---------------+--------+------------------+
|2016-10-05| Anaheim| Anaheim Ducks| Pacific|Western Conference|
|2016-10-09| Anaheim| Anaheim Ducks| Pacific|Western Conference|
+----------+----------+---------------+--------+------------------+
scheduleRich.groupBy("division").count().orderBy($"count".desc).show()
+------------+-----+
| division|count|
+------------+-----+
| Pacific| 36|
| Central| 20|
| Atlantic| 16|
|Metropolitan| 16|
+------------+-----+
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment