Last active
September 17, 2016 08:27
-
-
Save pavlov99/2dd7739c3d6de237b8b39b118332ea06 to your computer and use it in GitHub Desktop.
lookup-table-maintenance-in-hive blog files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
val schedule = sqlContext.read | |
.format("com.databricks.spark.csv") | |
.option("header", "true") | |
.option("inferSchema", "true") | |
.load("lookup-example/san-jose-schedule-2016-2017.csv") | |
.select( | |
to_date( | |
unix_timestamp($"START_DATE", "MM/dd/yyyy").cast("timestamp") | |
) as "date", | |
when( | |
locate("San Jose", $"SUBJECT") === 1, | |
regexp_extract($"SUBJECT", "^San Jose at (.*)$", 1) | |
).otherwise( | |
regexp_extract($"SUBJECT", "^(.*) at San Jose$", 1) | |
) as "competitor" | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE EXTERNAL TABLE IF NOT EXISTS lookup_example_nhl_ext( | |
team String, | |
division String, | |
conference String) | |
COMMENT 'NHL teams' | |
ROW FORMAT DELIMITED | |
FIELDS TERMINATED BY ',' | |
LINES TERMINATED BY '\n' | |
STORED AS TEXTFILE | |
LOCATION 'hdfs:///user/<user>/lookup-example/nhl-lookup' | |
TBLPROPERTIES ("skip.header.line.count"="1"); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
select * from lookup_example_nhl_ext limit 5; | |
OK | |
Boston Bruins Atlantic Eastern Conference | |
Buffalo Sabres Atlantic Eastern Conference | |
Detroit Red Wings Atlantic Eastern Conference | |
Florida Panthers Atlantic Eastern Conference | |
Montreal Canadiens Atlantic Eastern Conference | |
Time taken: 0.1 seconds, Fetched: 5 row(s) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sqlContext.table("lookup_example_nhl_ext").limit(2).show() | |
+-------------+--------+------------------+ | |
| team|division| conference| | |
+-------------+--------+------------------+ | |
| Team|Division| Conference| | |
|Boston Bruins|Atlantic|Eastern Conference| | |
+-------------+--------+------------------+ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE VIEW IF NOT EXISTS lookup_example_nhl | |
AS | |
SELECT * | |
FROM lookup_example_nhl_ext | |
WHERE team != 'Team'; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
val teams = sqlContext.table("lookup_example_nhl") | |
.withColumn("short_name", | |
when( | |
locate("New York", $"team") === 1, | |
regexp_extract($"team", "\\w+$", 0) | |
).when( | |
(locate("Devils", $"team") > 0) || | |
(locate("Kings", $"team") > 0) || | |
(locate("Sharks", $"team") > 0) || | |
(locate("Blues", $"team") > 0), | |
regexp_extract($"team", "^(.*) \\w+", 1) | |
).otherwise(regexp_extract($"team", "^\\w+", 0)) | |
) | |
teams.show() | |
+--------------------+------------+------------------+------------+ | |
| team| division| conference| short_name| | |
+--------------------+------------+------------------+------------+ | |
| Boston Bruins| Atlantic|Eastern Conference| Boston| | |
| Buffalo Sabres| Atlantic|Eastern Conference| Buffalo| | |
| Detroit Red Wings| Atlantic|Eastern Conference| Detroit| | |
| Florida Panthers| Atlantic|Eastern Conference| Florida| | |
| Montreal Canadiens| Atlantic|Eastern Conference| Montreal| | |
| Ottawa Senators| Atlantic|Eastern Conference| Ottawa| | |
| Tampa Bay Lightning| Atlantic|Eastern Conference| Tampa| | |
| Toronto Maple Leafs| Atlantic|Eastern Conference| Toronto| | |
| Carolina Hurricanes|Metropolitan|Eastern Conference| Carolina| | |
|Columbus Blue Jac...|Metropolitan|Eastern Conference| Columbus| | |
+--------------------+------------+------------------+------------+ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.spark.sql.Window | |
val competitorWindow = Window | |
.partitionBy("date", "competitor") | |
.orderBy(levenshtein($"competitor", $"short_name")) | |
val scheduleRich = schedule | |
.join( | |
teams, levenshtein($"competitor", $"short_name") < 5, "left_outer" | |
) | |
.withColumn("_rank", row_number().over(competitorWindow)) | |
.filter($"_rank" === 1) | |
.drop("_rank") | |
scheduleRich.drop("short_name").show(2) | |
+----------+----------+---------------+--------+------------------+ | |
| date|competitor| team|division| conference| | |
+----------+----------+---------------+--------+------------------+ | |
|2016-10-05| Anaheim| Anaheim Ducks| Pacific|Western Conference| | |
|2016-10-09| Anaheim| Anaheim Ducks| Pacific|Western Conference| | |
+----------+----------+---------------+--------+------------------+ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
scheduleRich.groupBy("division").count().orderBy($"count".desc).show() | |
+------------+-----+ | |
| division|count| | |
+------------+-----+ | |
| Pacific| 36| | |
| Central| 20| | |
| Atlantic| 16| | |
|Metropolitan| 16| | |
+------------+-----+ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment