Created
January 6, 2016 16:48
-
-
Save tmcgrath/866376273797ea4c2900 to your computer and use it in GitHub Desktop.
Spark SQL with Scala using CSV input data source in spark console
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
todd-mcgraths-macbook-pro:spark-1.4.1-bin-hadoop2.4 toddmcgrath$ bin/spark-shell --packages com.databricks:spark-csv_2.10:1.3.0
Ivy Default Cache set to: /Users/toddmcgrath/.ivy2/cache
The jars for the packages stored in: /Users/toddmcgrath/.ivy2/jars
:: loading settings :: url = jar:file:/Users/toddmcgrath/Development/spark-1.4.1-bin-hadoop2.4/lib/spark-assembly-1.4.1-hadoop2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
com.databricks#spark-csv_2.10 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent;1.0
confs: [default]
found com.databricks#spark-csv_2.10;1.3.0 in central
found org.apache.commons#commons-csv;1.1 in central
found com.univocity#univocity-parsers;1.5.1 in central
:: resolution report :: resolve 285ms :: artifacts dl 10ms
:: modules in use:
com.databricks#spark-csv_2.10;1.3.0 from central in [default]
com.univocity#univocity-parsers;1.5.1 from central in [default]
org.apache.commons#commons-csv;1.1 from central in [default]
---------------------------------------------------------------------
| | modules || artifacts |
| conf | number| search|dwnlded|evicted|| number|dwnlded|
---------------------------------------------------------------------
| default | 3 | 0 | 0 | 0 || 3 | 0 |
---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent
confs: [default]
0 artifacts copied, 3 already retrieved (0kB/5ms)
2016-01-06 10:44:50.445 java[20332:1203] Unable to load realm info from SCDynamicStore
Welcome to
 ____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 1.4.1
/_/
Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.6.0_65)
Type in expressions to have them evaluated.
Type :help for more information.
Spark context available as sc.
SQL context available as sqlContext.
scala> val baby_names = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load("baby_names.csv")
2016-01-06 10:45:36.961 java[20332:1203] Unable to load realm info from SCDynamicStore
baby_names: org.apache.spark.sql.DataFrame = [Year: int, First Name: string, County: string, Sex: string, Count: int]
scala> baby_names.registerTempTable("names")
scala> val distinctYears = sqlContext.sql("select distinct Year from names")
distinctYears: org.apache.spark.sql.DataFrame = [Year: int]
scala> distinctYears.collect.foreach(println)
[2007]
[2008]
[2009]
[2010]
[2011]
[2012]
scala> baby_names.printSchema
root
|-- Year: integer (nullable = true)
|-- First Name: string (nullable = true)
|-- County: string (nullable = true)
|-- Sex: string (nullable = true)
|-- Count: integer (nullable = true)
scala> val popular_names = sqlContext.sql("select distinct(`First Name`), count(County) as cnt from names group by `First Name` order by cnt desc LIMIT 10")
popular_names: org.apache.spark.sql.DataFrame = [First Name: string, cnt: bigint]
scala> popular_names.collect.foreach(println)
[JACOB,237]
[EMMA,223]
[LOGAN,220]
[OLIVIA,217]
[ISABELLA,209]
[SOPHIA,200]
[NOAH,197]
[ETHAN,195]
[MASON,194]
[MICHAEL,194]
scala> val popular_names = sqlContext.sql("select distinct(`First Name`), sum(Count) as cnt from names group by `First Name` order by cnt desc LIMIT 10")
popular_names: org.apache.spark.sql.DataFrame = [First Name: string, cnt: bigint]
scala> popular_names.collect.foreach(println)
[MICHAEL,9187]
[MATTHEW,7891]
[JAYDEN,7807]
[ISABELLA,7782]
[JOSEPH,7609]
[JACOB,7444]
[ANTHONY,7427]
[DANIEL,7313]
[SOPHIA,7274]
[RYAN,7172]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example rows from baby_names.csv (header row followed by data):
Year,First Name,County,Sex,Count
2012,DOMINIC,CAYUGA,M,6
2012,ADDISON,ONONDAGA,F,14
2012,JULIA,ONONDAGA,F,15
2012,WILLIAM,WESTCHESTER,M,41
For more info see Spark SQL CSV Examples