Created
March 22, 2016 03:12
-
-
Save asimjalis/e6fc6a559c04e945a8a8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Apache Toree Demo" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Simple Spark Scala\n", | |
"\n", | |
"Test notebook with simple Spark Scala code.\n", | |
"\n", | |
"Take numbers 1 to 100, keep the numbers that are even, square them, and keep the first 10." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"org.apache.spark.SparkContext@1b0709e7" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sc" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"attributes": { | |
"classes": [ | |
"scala" | |
], | |
"id": "" | |
}, | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Array(9, 36, 81, 144, 225, 324, 441, 576, 729, 900)" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sc.parallelize(1 to 100).\n", | |
" filter(x => x % 3 == 0).\n", | |
" map(x => x * x).\n", | |
" take(10)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Use tab for auto-complete." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Test CSV Library" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Useful functions\n", | |
"\n", | |
"Define some functions." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"// Grab URL contents\n", | |
"def getUrl(url:String):String = \n", | |
" scala.io.Source.fromURL(url).mkString\n", | |
"\n", | |
"// Write file\n", | |
"def fileWrite(path:String,contents:String) = {\n", | |
" import java.io.{PrintWriter,File}\n", | |
" val writer = new PrintWriter(new File(path))\n", | |
" writer.write(contents)\n", | |
" writer.close\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Download Prices\n", | |
"\n", | |
"Get the historical stock price of AAPL and save it in AAPL.csv" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"AAPL.csv\n" | |
] | |
} | |
], | |
"source": [ | |
"val symbol = \"AAPL\"\n", | |
"val baseUrl = \"http://real-chart.finance.yahoo.com\"\n", | |
"val url = s\"${baseUrl}/table.csv?s=${symbol}&g=d&ignore=.csv\"\n", | |
"val csv = getUrl(url)\n", | |
"val csvFile = s\"${symbol}.csv\"\n", | |
"fileWrite(csvFile, csv)\n", | |
"println(csvFile)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Highest Prices\n", | |
"\n", | |
"Find the days with the highest adjusted close prices." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(130.67132,2015-05-22)\n", | |
"(130.579411,2015-02-23)\n", | |
"(130.235775,2015-04-27)\n", | |
"(130.20796,2015-07-20)\n", | |
"(130.178369,2015-05-27)\n" | |
] | |
} | |
], | |
"source": [ | |
"val stockRdd = sc.textFile(csvFile).\n", | |
" filter(line => line matches \".*\\\\d.*\").\n", | |
" map(line => line.split(\",\")).\n", | |
" map(fields => (fields(6).toDouble,fields(0))).\n", | |
" sortBy({case (close,date) => close},false)\n", | |
"\n", | |
"stockRdd.take(5).foreach(println)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Load CSV\n", | |
"\n", | |
"Now lets use SQL to analyze the stock instead of directly manipulating records." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Load CSV file as data frame." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"val df = sqlContext.read.\n", | |
" format(\"com.databricks.spark.csv\").\n", | |
" option(\"header\", \"true\").\n", | |
" option(\"inferSchema\", \"true\").\n", | |
" load(\"AAPL.csv\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### View Data Frame\n", | |
"\n", | |
"What does `df` look like?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+----------+----------+\n", | |
"| Date| Adj Close|\n", | |
"+----------+----------+\n", | |
"|2016-03-21|105.910004|\n", | |
"|2016-03-18|105.919998|\n", | |
"|2016-03-17|105.800003|\n", | |
"|2016-03-16|105.970001|\n", | |
"|2016-03-15|104.580002|\n", | |
"|2016-03-14|102.519997|\n", | |
"|2016-03-11|102.260002|\n", | |
"|2016-03-10|101.169998|\n", | |
"|2016-03-09|101.120003|\n", | |
"|2016-03-08|101.029999|\n", | |
"|2016-03-07|101.870003|\n", | |
"|2016-03-04|103.010002|\n", | |
"|2016-03-03| 101.5|\n", | |
"|2016-03-02| 100.75|\n", | |
"|2016-03-01|100.529999|\n", | |
"|2016-02-29| 96.690002|\n", | |
"|2016-02-26| 96.910004|\n", | |
"|2016-02-25| 96.760002|\n", | |
"|2016-02-24| 96.099998|\n", | |
"|2016-02-23| 94.690002|\n", | |
"+----------+----------+\n", | |
"only showing top 20 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"df.select(\"Date\",\"Adj Close\").show" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### SQL Queries\n", | |
"\n", | |
"Register it as a SQL table." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df.registerTempTable(\"aapl\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Find out how many rows it has." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+---------+\n", | |
"|row_count|\n", | |
"+---------+\n", | |
"| 8894|\n", | |
"+---------+\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"sqlContext.sql(\"SELECT COUNT(1) AS row_count FROM aapl\").show" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Highest Prices\n", | |
"\n", | |
"Find out what the highest adjusted close was." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+---------+\n", | |
"|max_close|\n", | |
"+---------+\n", | |
"|130.67132|\n", | |
"+---------+\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"sqlContext.sql(\"SELECT MAX(`Adj Close`) AS max_close FROM aapl\").show" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Find the dates of the 5 highest adjusted close prices." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"attributes": { | |
"classes": [ | |
"scala" | |
], | |
"id": "" | |
}, | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+----------+----------+\n", | |
"| Date| Adj Close|\n", | |
"+----------+----------+\n", | |
"|2015-05-22| 130.67132|\n", | |
"|2015-02-23|130.579411|\n", | |
"|2015-04-27|130.235775|\n", | |
"|2015-07-20| 130.20796|\n", | |
"|2015-05-27|130.178369|\n", | |
"+----------+----------+\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"sqlContext.sql(\"\"\"SELECT Date,`Adj Close` FROM aapl \n", | |
" ORDER BY `Adj Close` DESC LIMIT 5\"\"\").show" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Toree", | |
"language": "", | |
"name": "toree" | |
}, | |
"language_info": { | |
"name": "scala" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment