Skip to content

Instantly share code, notes, and snippets.

@granturing
Last active September 26, 2016 14:33
Show Gist options
  • Save granturing/29e0206277371b08d8b1 to your computer and use it in GitHub Desktop.
Save granturing/29e0206277371b08d8b1 to your computer and use it in GitHub Desktop.
DC Apache Spark MeetUp - Zeppelin & Spark SQL
{
"paragraphs": [
{
"text": "%md\n## Agenda\n\n1. Loading data, initial discovery\n2. Joining with browsers lookup\n3. Custom UDF for ip-geo\n4. Page dwell time, sessionize visits\n6. Revenue metrics\n7. Funnel analysis",
"dateUpdated": "Oct 6, 2015 2:38:20 AM",
"config": {
"colWidth": 8.0,
"editorMode": "ace/mode/markdown",
"editorHide": true,
"tableHide": false,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1441850383923_-1825399614",
"id": "20150910-015943_2026954340",
"result": {
"code": "SUCCESS",
"type": "HTML",
"msg": "\u003ch2\u003eAgenda\u003c/h2\u003e\n\u003col\u003e\n\u003cli\u003eLoading data, initial discovery\u003c/li\u003e\n\u003cli\u003eJoining with browsers lookup\u003c/li\u003e\n\u003cli\u003eCustom UDF for ip-geo\u003c/li\u003e\n\u003cli\u003ePage dwell time, sessionize visits\u003c/li\u003e\n\u003cli\u003eRevenue metrics\u003c/li\u003e\n\u003cli\u003eFunnel analysis\u003c/li\u003e\n\u003c/ol\u003e\n"
},
"dateCreated": "Sep 10, 2015 1:59:43 AM",
"dateStarted": "Oct 6, 2015 2:38:20 AM",
"dateFinished": "Oct 6, 2015 2:38:20 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"text": "%md\n\n![logo](http://spark.apache.org/images/spark-logo.png)\n\n\u003cbr\u003e\n\u003cbr\u003e",
"dateUpdated": "Oct 5, 2015 2:16:33 PM",
"config": {
"colWidth": 4.0,
"editorMode": "ace/mode/markdown",
"editorHide": true,
"tableHide": false,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442668067511_15728714",
"id": "20150919-130747_286072817",
"result": {
"code": "SUCCESS",
"type": "HTML",
"msg": "\u003cp\u003e\u003cimg src\u003d\"http://spark.apache.org/images/spark-logo.png\" alt\u003d\"logo\" /\u003e\u003c/p\u003e\n\u003cp\u003e\u003cbr\u003e\n\u003cbr /\u003e\u003cbr\u003e\u003c/p\u003e\n"
},
"dateCreated": "Sep 19, 2015 1:07:47 PM",
"dateStarted": "Sep 22, 2015 2:33:40 AM",
"dateFinished": "Sep 22, 2015 2:33:40 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Dependencies",
"text": "%dep\n\n// This needs to be run before any other Spark commands to ensure our dependencies are available\n\nz.load(\"com.databricks:spark-csv_2.10:1.2.0\")\nz.load(\"com.maxmind.geoip2:geoip2:2.3.1\")",
"dateUpdated": "Oct 5, 2015 11:45:57 PM",
"config": {
"tableHide": true,
"colWidth": 12.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1441827572917_665058106",
"id": "20150909-193932_1454078610",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "res0: org.apache.zeppelin.spark.dep.Dependency \u003d org.apache.zeppelin.spark.dep.Dependency@6662d4b0\n"
},
"dateCreated": "Sep 9, 2015 7:39:32 PM",
"dateStarted": "Oct 5, 2015 11:45:57 PM",
"dateFinished": "Oct 5, 2015 11:46:04 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Load Datasets",
"text": "// using the SparkCSV package for easy loading of CSV and TSV files\n\nval events \u003d sqlContext.read.format(\"com.databricks.spark.csv\").\n options(Map(\"header\"-\u003e\"true\", \"inferSchema\" -\u003e \"true\")).\n load(\"/data/meetup/events.csv\")\n\nval browsers \u003d sqlContext.read.format(\"com.databricks.spark.csv\").\n options(Map(\"header\"-\u003e\"true\", \"delimiter\" -\u003e \"\\t\", \"inferSchema\" -\u003e \"true\")).\n load(\"/data/meetup/browser.tsv\")\n\n\n\n\n",
"dateUpdated": "Oct 6, 2015 12:01:28 AM",
"config": {
"tableHide": false,
"colWidth": 8.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1441898835007_1628572669",
"id": "20150910-152715_620844420",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "events: org.apache.spark.sql.DataFrame \u003d [hit_time: string, visid: double, hitid: double, ip: string, url: string, page_name: string, referrer: string, browser: int, product: string, revenue: double, orders: int, units: int, ecom_event: string]\nbrowsers: org.apache.spark.sql.DataFrame \u003d [id: bigint, name: string]\n"
},
"dateCreated": "Sep 10, 2015 3:27:15 PM",
"dateStarted": "Oct 5, 2015 11:46:06 PM",
"dateFinished": "Oct 5, 2015 11:46:42 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"text": "events.printSchema",
"dateUpdated": "Oct 6, 2015 12:01:22 AM",
"config": {
"colWidth": 4.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"tableHide": false,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442543440508_788333577",
"id": "20150918-023040_1513861481",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "root\n |-- hit_time: string (nullable \u003d true)\n |-- visid: double (nullable \u003d true)\n |-- hitid: double (nullable \u003d true)\n |-- ip: string (nullable \u003d true)\n |-- url: string (nullable \u003d true)\n |-- page_name: string (nullable \u003d true)\n |-- referrer: string (nullable \u003d true)\n |-- browser: integer (nullable \u003d true)\n |-- product: string (nullable \u003d true)\n |-- revenue: double (nullable \u003d true)\n |-- orders: integer (nullable \u003d true)\n |-- units: integer (nullable \u003d true)\n |-- ecom_event: string (nullable \u003d true)\n\n"
},
"dateCreated": "Sep 18, 2015 2:30:40 AM",
"dateStarted": "Oct 6, 2015 12:01:22 AM",
"dateFinished": "Oct 6, 2015 12:01:23 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Sample Data",
"text": "// in Zeppelin we can call \"z.show\" to display our dataset, for DataFrames it will provide some basic visualizations such as a table and various charts\n\nz.show(events.sort(\"hit_time\"))",
"dateUpdated": "Oct 6, 2015 2:40:54 AM",
"config": {
"tableHide": true,
"colWidth": 12.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442807464951_352413554",
"id": "20150921-035104_470687314",
"result": {
"code": "SUCCESS",
"type": "TABLE",
"msg": ""
},
"dateCreated": "Sep 21, 2015 3:51:04 AM",
"dateStarted": "Sep 22, 2015 7:45:01 PM",
"dateFinished": "Sep 22, 2015 7:45:06 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Stats for Numeric Values",
"text": "// the describe method calulates some basic statistics of numerical columns\n\nz.show(events.where(\"revenue \u003e 0\").\n describe(\"revenue\", \"orders\", \"units\"))",
"dateUpdated": "Oct 6, 2015 12:03:26 AM",
"config": {
"tableHide": false,
"colWidth": 6.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "table",
"height": 188.0,
"optionOpen": false,
"keys": [
{
"name": "summary",
"index": 0.0,
"aggr": "sum"
}
],
"values": [
{
"name": "revenue",
"index": 1.0,
"aggr": "sum"
},
{
"name": "orders",
"index": 2.0,
"aggr": "sum"
},
{
"name": "units",
"index": 3.0,
"aggr": "sum"
}
],
"groups": [],
"scatter": {
"size": {
"name": "summary",
"index": 0.0,
"aggr": "sum"
},
"xAxis": {
"name": "summary",
"index": 0.0,
"aggr": "sum"
},
"yAxis": {
"name": "revenue",
"index": 1.0,
"aggr": "sum"
}
}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442541900263_398204246",
"id": "20150918-020500_1027466035",
"result": {
"code": "SUCCESS",
"type": "TABLE",
"msg": "summary\trevenue\torders\tunits\ncount\t812\t812\t812\nmean\t106.36847290640402\t1.0\t19.852216748768473\nstddev\t139.43563080879747\t0.0\t8.318051621068122\nmin\t1.1\t1\t10\nmax\t800.0\t1\t30\n"
},
"dateCreated": "Sep 18, 2015 2:05:00 AM",
"dateStarted": "Oct 5, 2015 2:19:00 PM",
"dateFinished": "Oct 5, 2015 2:19:06 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Frequent Items in Products",
"text": "z.show(events.stat.freqItems(Seq(\"product\"), 0.1).\n select(explode(column(\"product_freqItems\")).as(\"product\")))\n\n\n",
"dateUpdated": "Oct 6, 2015 12:03:26 AM",
"config": {
"tableHide": false,
"colWidth": 6.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "table",
"height": 174.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442370784469_240186565",
"id": "20150916-023304_356313631",
"result": {
"code": "SUCCESS",
"type": "TABLE",
"msg": "product\njeans\nskinny jean\nshorts\ntennis shorts\ncotton pants\ncufflinks\nlow waist jean\n\n"
},
"dateCreated": "Sep 16, 2015 2:33:04 AM",
"dateStarted": "Oct 5, 2015 2:19:11 PM",
"dateFinished": "Oct 5, 2015 2:19:13 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Join with Browsers Lookup Table",
"text": "val withBrowser \u003d events.\n join(browsers, \n events(\"browser\") \u003d\u003d\u003d browsers(\"id\"), \n \"left_outer\").\n drop(\"id\").\n drop(\"browser\").\n withColumnRenamed(\"name\", \"browser\").\n na.fill(\"Unknown\", Seq(\"browser\"))",
"dateUpdated": "Oct 6, 2015 12:04:05 AM",
"config": {
"tableHide": true,
"colWidth": 8.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442370883309_-3556306",
"id": "20150916-023443_1970195425",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "withBrowser: org.apache.spark.sql.DataFrame \u003d [hit_time: string, visid: double, hitid: double, ip: string, url: string, page_name: string, referrer: string, product: string, revenue: double, orders: int, units: int, ecom_event: string, browser: string]\n"
},
"dateCreated": "Sep 16, 2015 2:34:43 AM",
"dateStarted": "Oct 5, 2015 11:46:53 PM",
"dateFinished": "Oct 5, 2015 11:46:54 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"text": "%md\n\u003cbr\u003e\nIf we wanted to perform a broadcast join we\u0027d first have to cache the browsers dataset or we can use a broadcast \"hint\" by writing the join like so:\n\u003cbr\u003e\n```scala\nval withBrowser \u003d events.\n join(broadcast(browsers),\n events(\"browser\") \u003d\u003d\u003d browsers(\"id\"),\n \"left_outer\")\n```",
"dateUpdated": "Oct 6, 2015 2:34:27 AM",
"config": {
"colWidth": 4.0,
"editorMode": "ace/mode/markdown",
"editorHide": true,
"tableHide": false,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442544848815_1426667584",
"id": "20150918-025408_940872422",
"result": {
"code": "SUCCESS",
"type": "HTML",
"msg": "\u003cp\u003e\u003cbr\u003e\n\u003cbr /\u003eIf we wanted to perform a broadcast join we\u0027d first have to cache the browsers dataset or we can use a broadcast \u0026ldquo;hint\u0026rdquo; by writing the join like so:\n\u003cbr /\u003e\u003cbr\u003e\u003c/p\u003e\n\u003cpre\u003e\u003ccode class\u003d\"scala\"\u003eval withBrowser \u003d events.\n join(broadcast(browsers),\n events(\"browser\") \u003d\u003d\u003d browsers(\"id\"),\n \"left_outer\")\n\u003c/code\u003e\u003c/pre\u003e\n"
},
"dateCreated": "Sep 18, 2015 2:54:08 AM",
"dateStarted": "Oct 6, 2015 2:34:25 AM",
"dateFinished": "Oct 6, 2015 2:34:25 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Most Used Browsers",
"text": "val visitorsByBrowser \u003d withBrowser.\n groupBy(\"browser\").\n agg(countDistinct(\"visid\").as(\"count\")).\n sort(column(\"count\").desc)\n\nz.show(visitorsByBrowser)",
"dateUpdated": "Oct 6, 2015 2:34:43 AM",
"config": {
"tableHide": true,
"colWidth": 12.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [
{
"name": "browser",
"index": 0.0,
"aggr": "sum"
}
],
"values": [
{
"name": "count",
"index": 1.0,
"aggr": "sum"
}
],
"groups": [],
"scatter": {
"xAxis": {
"name": "browser",
"index": 0.0,
"aggr": "sum"
},
"yAxis": {
"name": "count",
"index": 1.0,
"aggr": "sum"
}
}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442544779969_1480624375",
"id": "20150918-025259_279579487",
"result": {
"code": "SUCCESS",
"type": "TABLE",
"msg": "browser\tcount\nMicrosoft Internet Explorer 6.0\t2715\nUnknown\t1894\nMicrosoft Internet Explorer 5.5\t176\nMicrosoft MSN Internet Access\t79\nMozilla Firefox 3.0\t54\nGoogle Chrome (unknown version)\t48\nMicrosoft Internet Explorer 5.0\t33\nMicrosoft Internet Explorer 5.01\t30\nMicrosoft Internet Explorer 6.0 (AOL)\t30\nMicrosoft MSN Explorer 2.5\t25\nSafari 1.0.3\t22\nSafari 1.2.4\t16\nSafari 1.3\t12\nMicrosoft Internet Explorer 5.16 (Macintosh)\t7\nSafari 1.1.1\t6\nMicrosoft Internet Explorer 5.23 (Macintosh)\t6\nMicrosoft Internet Explorer 4.01\t6\nSafari 2.0\t6\nSafari 1.2.3\t6\nSafari 1.3.1\t5\nMicrosoft Internet Explorer 5.22 (Macintosh)\t5\nOpera 7.54\t5\nMicrosoft Internet Explorer (unknown version)\t4\nMicrosoft Internet Explorer 5.17 (Macintosh)\t4\nMicrosoft Internet Explorer 5.14 (Macintosh)\t3\nSafari 1.3.2\t2\nSafari 2.0.4\t2\nMozilla Firefox 1.5 beta\t1\nSafari 2.0.3\t1\nMozilla Firefox 1.0.5\t1\nSafari 2.0.2\t1\nMozilla Firefox (unknown version)\t1\nAmerica Online Browser (unknown version)\t1\n"
},
"dateCreated": "Sep 18, 2015 2:52:59 AM",
"dateStarted": "Oct 5, 2015 2:24:30 PM",
"dateFinished": "Oct 5, 2015 2:24:35 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Geo-IP Lookup UDF First Try",
"text": "import org.apache.spark.sql.functions._\nimport java.io.File\nimport java.net.InetAddress\nimport com.maxmind.geoip2._\n\ncase class Location(latitude: Double, longitude: Double)\ncase class Geo(city: String, country: String, loc: Location)\n\n// this is not very good since it\u0027ll re-load the DB file each execution of the UDF\nval iplookupv1 \u003d udf { (s: String) \u003d\u003e {\n val ip \u003d InetAddress.getByName(s)\n \n val db \u003d new File(\"/data/meetup/GeoLite2-City.mmdb\")\n \n val reader \u003d new DatabaseReader.Builder(db).build()\n \n val record \u003d GeoIPLookup.reader.city(ip)\n \n val city \u003d record.getCity\n val country \u003d record.getCountry\n val location \u003d record.getLocation\n \n Geo(city.getName, country.getName, Location(location.getLatitude, location.getLongitude))\n} }",
"dateUpdated": "Oct 6, 2015 12:05:16 AM",
"config": {
"tableHide": true,
"colWidth": 12.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442672133114_1131305646",
"id": "20150919-141533_912317917",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "import org.apache.spark.sql.functions._\nimport java.io.File\nimport java.net.InetAddress\nimport com.maxmind.geoip2._\ndefined class Location\ndefined class Geo\niplookupv1: org.apache.spark.sql.UserDefinedFunction \u003d UserDefinedFunction(\u003cfunction1\u003e,StructType(StructField(city,StringType,true), StructField(country,StringType,true), StructField(loc,StructType(StructField(latitude,DoubleType,false), StructField(longitude,DoubleType,false)),true)),List(StringType))\n"
},
"dateCreated": "Sep 19, 2015 2:15:33 PM",
"dateStarted": "Sep 19, 2015 2:17:12 PM",
"dateFinished": "Sep 19, 2015 2:17:17 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Geo-IP lookup UDF Better",
"text": "import org.apache.spark.sql.functions._\nimport java.io.File\nimport java.net.InetAddress\nimport com.maxmind.geoip2._\n\n// rather than load the DB file each call we use a Scala object with a lazy initializer (Java singleton)\n// this will get initialized once in each executor JVM, the @transient attribute tells Scala not to serialize the field\n\nobject GeoIPLookup {\n @transient lazy val reader \u003d {\n val db \u003d new File(\"/data/meetup/GeoLite2-City.mmdb\")\n \n val reader \u003d new DatabaseReader.Builder(db).build()\n \n reader\n }\n}\n\ncase class Location(latitude: Double, longitude: Double)\ncase class Geo(city: String, country: String, loc: Location)\n\nval iplookup \u003d udf { (s: String) \u003d\u003e {\n val ip \u003d InetAddress.getByName(s)\n \n val record \u003d GeoIPLookup.reader.city(ip)\n \n val city \u003d record.getCity\n val country \u003d record.getCountry\n val location \u003d record.getLocation\n \n Geo(city.getName, country.getName, Location(location.getLatitude, location.getLongitude))\n} }",
"dateUpdated": "Oct 6, 2015 12:05:42 AM",
"config": {
"tableHide": true,
"colWidth": 12.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442545858306_-1557633185",
"id": "20150918-031058_1813076048",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "import org.apache.spark.sql.functions._\nimport java.io.File\nimport java.net.InetAddress\nimport com.maxmind.geoip2._\ndefined module GeoIPLookup\ndefined class Location\ndefined class Geo\niplookup: org.apache.spark.sql.UserDefinedFunction \u003d UserDefinedFunction(\u003cfunction1\u003e,StructType(StructField(city,StringType,true), StructField(country,StringType,true), StructField(loc,StructType(StructField(latitude,DoubleType,false), StructField(longitude,DoubleType,false)),true)),List(StringType))\n"
},
"dateCreated": "Sep 18, 2015 3:10:58 AM",
"dateStarted": "Oct 5, 2015 11:47:01 PM",
"dateFinished": "Oct 5, 2015 11:47:06 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Run Geo-IP Lookup",
"text": "val withGeo \u003d withBrowser.\n withColumn(\"geo\", iplookup(column(\"ip\")))\n\nwithGeo.printSchema",
"dateUpdated": "Oct 6, 2015 2:36:21 AM",
"config": {
"tableHide": false,
"colWidth": 6.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [
{
"name": "ip",
"index": 0.0,
"aggr": "sum"
}
],
"values": [
{
"name": "city",
"index": 1.0,
"aggr": "sum"
}
],
"groups": [],
"scatter": {
"xAxis": {
"name": "ip",
"index": 0.0,
"aggr": "sum"
}
}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442670110426_662417162",
"id": "20150919-134150_886441783",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "withGeo: org.apache.spark.sql.DataFrame \u003d [hit_time: string, visid: double, hitid: double, ip: string, url: string, page_name: string, referrer: string, product: string, revenue: double, orders: int, units: int, ecom_event: string, browser: string, geo: struct\u003ccity:string,country:string,loc:struct\u003clatitude:double,longitude:double\u003e\u003e]\nroot\n |-- hit_time: string (nullable \u003d true)\n |-- visid: double (nullable \u003d true)\n |-- hitid: double (nullable \u003d true)\n |-- ip: string (nullable \u003d true)\n |-- url: string (nullable \u003d true)\n |-- page_name: string (nullable \u003d true)\n |-- referrer: string (nullable \u003d true)\n |-- product: string (nullable \u003d true)\n |-- revenue: double (nullable \u003d true)\n |-- orders: integer (nullable \u003d true)\n |-- units: integer (nullable \u003d true)\n |-- ecom_event: string (nullable \u003d true)\n |-- browser: string (nullable \u003d false)\n |-- geo: struct (nullable \u003d true)\n | |-- city: string (nullable \u003d true)\n | |-- country: string (nullable \u003d true)\n | |-- loc: struct (nullable \u003d true)\n | | |-- latitude: double (nullable \u003d false)\n | | |-- longitude: double (nullable \u003d false)\n\n"
},
"dateCreated": "Sep 19, 2015 1:41:50 PM",
"dateStarted": "Oct 5, 2015 11:47:12 PM",
"dateFinished": "Oct 5, 2015 11:47:13 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Query Revenue By Location",
"text": "// this is just showing how we can use Zeppelin to provide filtering of our Spark\n// queries from the UI\n\ncase class Loc(lat: Double, lon: Double, desc: String)\n\nval argLimit \u003d z.input(\"Limit\", 50).toString.toInt\nval country \u003d z.select(\"Country\", \"United States\",\n Seq((\"United States\", \"United States\"), \n (\"Italy\", \"Italy\"), \n (\"England\", \"England\"), \n (\"Germany\", \"Germany\"), \n (\"France\", \"France\"),\n (\"*\", \"All\"))\n).toString\n\nval countryFilter \u003d country match {\n case \"*\" \u003d\u003e \"true \u003d true\"\n case _ \u003d\u003e s\"geo.country \u003d \u0027$country\u0027\"\n }\n\nval locations \u003d withGeo.\n where(countryFilter).\n where(\"geo.city is not null\").\n groupBy(\"geo\").\n agg(sum(\"revenue\").as(\"revenue\")).\n where(\"revenue \u003e 0\").\n select(\"geo.loc.latitude\", \"geo.loc.longitude\", \"geo.city\", \"revenue\").\n sort(column(\"revenue\").desc).\n limit(argLimit)\n \nval fmt \u003d java.text.NumberFormat.getCurrencyInstance(java.util.Locale.US)\n\nval result \u003d locations.collect().\n map(i \u003d\u003e {\n Loc(\n i.getDouble(0),\n i.getDouble(1),\n i.getString(2) + \": \" + fmt.format(i.getDouble(3))\n )\n }).toArray\n\nz.angularBind(\"locations\", result)\n\nprintln(s\"%html\") //just a hack to disable Scala output",
"dateUpdated": "Oct 6, 2015 2:36:03 AM",
"config": {
"tableHide": false,
"colWidth": 6.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
}
},
"settings": {
"params": {
"Country": "United States",
"Limit": "20"
},
"forms": {
"Limit": {
"name": "Limit",
"displayName": "Limit",
"defaultValue": 50.0,
"hidden": false
},
"Country": {
"name": "Country",
"displayName": "Country",
"defaultValue": "",
"options": [
{
"value": "United States",
"displayName": "United States"
},
{
"value": "Italy",
"displayName": "Italy"
},
{
"value": "England",
"displayName": "England"
},
{
"value": "Germany",
"displayName": "Germany"
},
{
"value": "France",
"displayName": "France"
},
{
"value": "*",
"displayName": "All"
}
],
"hidden": false
}
}
},
"jobName": "paragraph_1442946053489_450492026",
"id": "20150922-182053_769430647",
"result": {
"code": "SUCCESS",
"type": "HTML",
"msg": ""
},
"dateCreated": "Sep 22, 2015 6:20:53 PM",
"dateStarted": "Oct 6, 2015 12:15:32 AM",
"dateFinished": "Oct 6, 2015 12:15:37 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Mapping Revenue By Location",
"text": "%angular\n\n\u003clink rel\u003d\"stylesheet\" href\u003d\"https://cdnjs.cloudflare.com/ajax/libs/leaflet/0.7.5/leaflet.css\" /\u003e\n\u003cdiv id\u003d\"map\" style\u003d\"height: 800px; width: 100%\"\u003e\u003c/div\u003e\n\n\u003cscript type\u003d\"text/javascript\"\u003e\nfunction initMap() {\n var map \u003d L.map(\u0027map\u0027).setView([30.00, -30.00], 3);\n\n L.tileLayer(\u0027http://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png\u0027, {\n attribution: \u0027Map data \u0026copy; \u003ca href\u003d\"http://openstreetmap.org\"\u003eOpenStreetMap\u003c/a\u003e contributors\u0027,\n maxZoom: 12,\n minZoom: 3\n }).addTo(map);\n\n var geoMarkers \u003d L.layerGroup().addTo(map);\n \n var el \u003d angular.element($(\u0027#map\u0027).parent(\u0027.ng-scope\u0027));\n angular.element(el).ready(function() {\n window.locationWatcher \u003d el.scope().compiledScope.$watch(\u0027locations\u0027, function(newValue, oldValue) {\n geoMarkers.clearLayers();\n angular.forEach(newValue, function(loc) {\n var marker \u003d L.marker([ loc.lat, loc.lon ])\n .bindPopup(loc.desc)\n .addTo(geoMarkers);\n });\n })\n });\n}\n\nif (window.locationWatcher) {\n // clear existing watcher otherwise we\u0027ll have duplicates\n window.locationWatcher();\n}\n\n// ensure we only load the script once, seems to cause issues otherwise\nif (window.L) {\n initMap();\n} else {\n console.log(\u0027Loading Leaflet library\u0027);\n var sc \u003d document.createElement(\u0027script\u0027);\n sc.type \u003d \u0027text/javascript\u0027;\n sc.src \u003d \u0027https://cdnjs.cloudflare.com/ajax/libs/leaflet/0.7.5/leaflet.js\u0027;\n sc.onload \u003d initMap;\n sc.onerror \u003d function(err) { alert(err); }\n document.getElementsByTagName(\u0027head\u0027)[0].appendChild(sc);\n}\n\u003c/script\u003e",
"dateUpdated": "Oct 6, 2015 2:52:26 AM",
"config": {
"tableHide": false,
"colWidth": 12.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442945801831_-140449991",
"id": "20150922-181641_1025413571",
"result": {
"code": "SUCCESS",
"type": "ANGULAR",
"msg": "\n\u003clink rel\u003d\"stylesheet\" href\u003d\"https://cdnjs.cloudflare.com/ajax/libs/leaflet/0.7.5/leaflet.css\" /\u003e\n\u003cdiv id\u003d\"map\" style\u003d\"height: 800px; width: 100%\"\u003e\u003c/div\u003e\n\n\u003cscript type\u003d\"text/javascript\"\u003e\nfunction initMap() {\n var map \u003d L.map(\u0027map\u0027).setView([30.00, -30.00], 3);\n\n L.tileLayer(\u0027http://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png\u0027, {\n attribution: \u0027Map data \u0026copy; \u003ca href\u003d\"http://openstreetmap.org\"\u003eOpenStreetMap\u003c/a\u003e contributors\u0027,\n maxZoom: 12,\n minZoom: 3\n }).addTo(map);\n\n var geoMarkers \u003d L.layerGroup().addTo(map);\n \n var el \u003d angular.element($(\u0027#map\u0027).parent(\u0027.ng-scope\u0027));\n angular.element(el).ready(function() {\n window.locationWatcher \u003d el.scope().compiledScope.$watch(\u0027locations\u0027, function(newValue, oldValue) {\n geoMarkers.clearLayers();\n angular.forEach(newValue, function(loc) {\n var marker \u003d L.marker([ loc.lat, loc.lon ])\n .bindPopup(loc.desc)\n .addTo(geoMarkers);\n });\n })\n });\n}\n\nif (window.locationWatcher) {\n // clear existing watcher otherwise we\u0027ll have duplicates\n window.locationWatcher();\n}\n\n// ensure we only load the script once, seems to cause issues otherwise\nif (window.L) {\n initMap();\n} else {\n console.log(\u0027Loading Leaflet library\u0027);\n var sc \u003d document.createElement(\u0027script\u0027);\n sc.type \u003d \u0027text/javascript\u0027;\n sc.src \u003d \u0027https://cdnjs.cloudflare.com/ajax/libs/leaflet/0.7.5/leaflet.js\u0027;\n sc.onload \u003d initMap;\n sc.onerror \u003d function(err) { alert(err); }\n document.getElementsByTagName(\u0027head\u0027)[0].appendChild(sc);\n}\n\u003c/script\u003e"
},
"dateCreated": "Sep 22, 2015 6:16:41 PM",
"dateStarted": "Oct 6, 2015 2:52:26 AM",
"dateFinished": "Oct 6, 2015 2:52:26 AM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"text": "%md\n# Analytics with Window Functions\n\n* ### Introduced in v1.4\n* ### Similar to Hive Windowing Functions\n* ### Allows for aggregate operations on sliding window of rows\n* ### A Window consists of:\n* * #### A partition (grouping of rows)\n* * #### A sort order within the partition\n* * #### A range for the window, based on number of rows or range of order values\n* ###[Databricks Blog: Introducing Window Functions in Spark SQL](https://databricks.com/blog/2015/07/15/introducing-window-functions-in-spark-sql.html)\n\n\u003cbr/\u003e\n\n![rows](https://databricks.com/wp-content/uploads/2015/07/2-1-1024x338.png)\n\n\u003cbr/\u003e\n\n![rows](https://databricks.com/wp-content/uploads/2015/07/2-2-1024x369.png)\n\n",
"dateUpdated": "Oct 5, 2015 2:36:53 PM",
"config": {
"colWidth": 12.0,
"editorMode": "ace/mode/markdown",
"editorHide": true,
"tableHide": false,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442858582864_-1870987546",
"id": "20150921-180302_1740311258",
"result": {
"code": "SUCCESS",
"type": "HTML",
"msg": "\u003ch1\u003eAnalytics with Window Functions\u003c/h1\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ch3\u003eIntroduced in v1.4\u003c/h3\u003e\n\u003c/li\u003e\n\u003cli\u003e\u003ch3\u003eSimilar to Hive Windowing Functions\u003c/h3\u003e\n\u003c/li\u003e\n\u003cli\u003e\u003ch3\u003eAllows for aggregate operations on sliding window of rows\u003c/h3\u003e\n\u003c/li\u003e\n\u003cli\u003e\u003ch3\u003eA Window consists of:\u003c/h3\u003e\n\u003c/li\u003e\n\u003cli\u003e\u003cul\u003e\n\u003cli\u003e\u003ch4\u003eA partition (grouping of rows)\u003c/h4\u003e\n\u003c/li\u003e\n\u003c/ul\u003e\n\u003c/li\u003e\n\u003cli\u003e\u003cul\u003e\n\u003cli\u003e\u003ch4\u003eA sort order within the partition\u003c/h4\u003e\n\u003c/li\u003e\n\u003c/ul\u003e\n\u003c/li\u003e\n\u003cli\u003e\u003cul\u003e\n\u003cli\u003e\u003ch4\u003eA range for the window, based on number of rows or range of order values\u003c/h4\u003e\n\u003c/li\u003e\n\u003c/ul\u003e\n\u003c/li\u003e\n\u003cli\u003e\u003ch3\u003e\u003ca href\u003d\"https://databricks.com/blog/2015/07/15/introducing-window-functions-in-spark-sql.html\"\u003eDatabricks Blog: Introducing Window Functions in Spark SQL\u003c/a\u003e\u003c/h3\u003e\n\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003e\u003cbr /\u003e\u003c/p\u003e\n\u003cp\u003e\u003cimg src\u003d\"https://databricks.com/wp-content/uploads/2015/07/2-1-1024x338.png\" alt\u003d\"rows\" /\u003e\u003c/p\u003e\n\u003cp\u003e\u003cbr /\u003e\u003c/p\u003e\n\u003cp\u003e\u003cimg src\u003d\"https://databricks.com/wp-content/uploads/2015/07/2-2-1024x369.png\" alt\u003d\"rows\" /\u003e\u003c/p\u003e\n"
},
"dateCreated": "Sep 21, 2015 6:03:02 PM",
"dateStarted": "Sep 21, 2015 6:21:40 PM",
"dateFinished": "Sep 21, 2015 6:21:40 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Measure Page Dwell time",
"text": "import org.apache.spark.sql.expressions.Window\n\nval windowSpec \u003d Window.\n partitionBy(\"visid\").\n orderBy(column(\"hit_time\").asc)\n\nval tsFormat \u003d \"YYYY-MM-dd HH:mm:ss\"\n\nval withDwellTime \u003d withGeo.\n withColumn(\"dwell_time\", \n unix_timestamp(column(\"hit_time\"), tsFormat) - \n unix_timestamp(lag(\"hit_time\", 1, 0l).over(windowSpec), tsFormat)).\n na.fill(0l, Seq(\"dwell_time\"))",
"dateUpdated": "Oct 6, 2015 2:37:32 AM",
"config": {
"tableHide": true,
"colWidth": 12.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [
{
"name": "hit_time",
"index": 0.0,
"aggr": "sum"
}
],
"values": [
{
"name": "visid",
"index": 1.0,
"aggr": "sum"
}
],
"groups": [],
"scatter": {
"xAxis": {
"name": "hit_time",
"index": 0.0,
"aggr": "sum"
},
"yAxis": {
"name": "visid",
"index": 1.0,
"aggr": "sum"
}
}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442761506960_528927024",
"id": "20150920-150506_450791508",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "import org.apache.spark.sql.expressions.Window\nwindowSpec: org.apache.spark.sql.expressions.WindowSpec \u003d org.apache.spark.sql.expressions.WindowSpec@4b9d1c66\ntsFormat: String \u003d YYYY-MM-dd HH:mm:ss\nwithDwellTime: org.apache.spark.sql.DataFrame \u003d [hit_time: string, visid: double, hitid: double, ip: string, url: string, page_name: string, referrer: string, product: string, revenue: double, orders: int, units: int, ecom_event: string, browser: string, geo: struct\u003ccity:string,country:string,loc:struct\u003clatitude:double,longitude:double\u003e\u003e, dwell_time: bigint]\n"
},
"dateCreated": "Sep 20, 2015 3:05:06 PM",
"dateStarted": "Oct 5, 2015 11:49:29 PM",
"dateFinished": "Oct 5, 2015 11:49:31 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Sessionize Visits using Dwell Time",
"text": "val sessionized \u003d withDwellTime.\n withColumn(\"newVisit\", when(column(\"dwell_time\") \u003e 15*60, 1).otherwise(0)).\n withColumn(\"visitnum\", sum(\"newVisit\").over(windowSpec)).\n withColumn(\"visitnum\", column(\"visitnum\") + 1).\n drop(\"newVisit\")",
"dateUpdated": "Oct 5, 2015 11:57:57 PM",
"config": {
"tableHide": false,
"colWidth": 12.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": false,
"keys": [
{
"name": "visid",
"index": 0.0,
"aggr": "sum"
}
],
"values": [
{
"name": "hit_time",
"index": 1.0,
"aggr": "sum"
}
],
"groups": [],
"scatter": {
"xAxis": {
"name": "visid",
"index": 0.0,
"aggr": "sum"
},
"yAxis": {
"name": "hit_time",
"index": 1.0,
"aggr": "sum"
}
}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442763072437_-294213982",
"id": "20150920-153112_2140102228",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "sessionized: org.apache.spark.sql.DataFrame \u003d [hit_time: string, visid: double, hitid: double, ip: string, url: string, page_name: string, referrer: string, product: string, revenue: double, orders: int, units: int, ecom_event: string, browser: string, geo: struct\u003ccity:string,country:string,loc:struct\u003clatitude:double,longitude:double\u003e\u003e, dwell_time: bigint, visitnum: bigint]\n"
},
"dateCreated": "Sep 20, 2015 3:31:12 PM",
"dateStarted": "Oct 5, 2015 11:57:57 PM",
"dateFinished": "Oct 5, 2015 11:57:58 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Top External Referrers by Revenue",
"text": "// another example UDF to extract the hostname from a URL\n\nval host \u003d udf { (s: String) \u003d\u003e s.length \u003e 0 match {\n case true \u003d\u003e {\n try {\n val url \u003d new java.net.URL(s)\n url.getHost\n } catch {\n case e: java.net.MalformedURLException \u003d\u003e \"\"\n }\n }\n case _ \u003d\u003e \"\"\n} }\n\n// we want to find referrers per session, so the window will be partitioned by visid and visitnum\n// and ordered by hit_time\n\nval sessionWindowSpec \u003d Window.\n partitionBy(\"visid\", \"visitnum\").\n orderBy(\"hit_time\")\n \nval externalReferrers \u003d sessionized.\n withColumn(\"session_referrer\", host(first(\"referrer\").over(sessionWindowSpec))).\n where(\"session_referrer not in (\u0027www.granturing.com\u0027, \u0027\u0027)\")\n\nval revenueByReferrer \u003d externalReferrers.\n groupBy(\"session_referrer\").\n agg(sum(\"revenue\").as(\"total_revenue\")).\n sort(desc(\"total_revenue\")).\n limit(10)\n\nz.show(revenueByReferrer)",
"dateUpdated": "Oct 6, 2015 2:37:48 AM",
"config": {
"tableHide": true,
"colWidth": 12.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "multiBarChart",
"height": 300.0,
"optionOpen": false,
"keys": [
{
"name": "session_referrer",
"index": 0.0,
"aggr": "sum"
}
],
"values": [
{
"name": "total_revenue",
"index": 1.0,
"aggr": "sum"
}
],
"groups": [],
"scatter": {
"xAxis": {
"name": "session_referrer",
"index": 0.0,
"aggr": "sum"
},
"yAxis": {
"name": "total_revenue",
"index": 1.0,
"aggr": "sum"
}
}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442775134624_560795631",
"id": "20150920-185214_518066135",
"result": {
"code": "SUCCESS",
"type": "TABLE",
"msg": ""
},
"dateCreated": "Sep 20, 2015 6:52:14 PM",
"dateStarted": "Oct 5, 2015 11:58:02 PM",
"dateFinished": "Oct 5, 2015 11:58:08 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Top Revenue By Landing Page",
"text": "// the landing page is the first page in the session but a purchase will occur at the end of the session, so\n// we use window functions to keep track of the landing page across the whole session\n\nval landingPageRevenue \u003d sessionized.\n withColumn(\"landing_page\", first(\"page_name\").over(sessionWindowSpec)).\n groupBy(\"landing_page\").\n agg(sum(\"revenue\").as(\"revenue\")).\n sort(column(\"revenue\").desc).\n limit(10)\n\nz.show(landingPageRevenue)",
"dateUpdated": "Oct 6, 2015 2:37:57 AM",
"config": {
"tableHide": true,
"colWidth": 12.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "multiBarChart",
"height": 300.0,
"optionOpen": false,
"keys": [],
"values": [],
"groups": [],
"scatter": {}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442807106229_-876411492",
"id": "20150921-034506_638321073",
"result": {
"code": "SUCCESS",
"type": "TABLE",
"msg": ""
},
"dateCreated": "Sep 21, 2015 3:45:06 AM",
"dateStarted": "Oct 5, 2015 11:58:15 PM",
"dateFinished": "Oct 5, 2015 11:58:21 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Top 2 Purchased Items By Cities in US",
"text": "// using the rank window function to rank top 2 products per city in the US\n\nval geoWindowSpec \u003d Window.\n partitionBy(\"geo.city\", \"geo.country\").\n orderBy(column(\"revenue\").desc)\n \nval topProductsByCity \u003d sessionized.\n where(\"product \u003c\u003e \u0027\u0027\").\n where(\"revenue \u003e 0\").\n where(\"geo.city is not null\").\n where(\"geo.country \u003d \u0027United States\u0027\").\n select(\n column(\"geo.city\"),\n column(\"geo.country\"),\n column(\"product\"),\n column(\"revenue\"),\n rank.over(geoWindowSpec).as(\"rank\")\n ).\n where(\"rank \u003c\u003d 2\")\n\nz.show(topProductsByCity)",
"dateUpdated": "Oct 6, 2015 2:38:02 AM",
"config": {
"tableHide": true,
"colWidth": 12.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "table",
"height": 300.0,
"optionOpen": true,
"keys": [
{
"name": "city",
"index": 0.0,
"aggr": "sum"
}
],
"values": [
{
"name": "country",
"index": 1.0,
"aggr": "sum"
}
],
"groups": [],
"scatter": {
"xAxis": {
"name": "city",
"index": 0.0,
"aggr": "sum"
},
"yAxis": {
"name": "country",
"index": 1.0,
"aggr": "sum"
}
}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442776030410_-756692993",
"id": "20150920-190710_1327070312",
"result": {
"code": "SUCCESS",
"type": "TABLE",
"msg": ""
},
"dateCreated": "Sep 20, 2015 7:07:10 PM",
"dateStarted": "Oct 5, 2015 11:58:35 PM",
"dateFinished": "Oct 5, 2015 11:58:41 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Funnel Analysis",
"text": "// this is just setting a flag (0,1) for each event of a funnel\nval funnel \u003d sessionized.\n withColumn(\"viewed_product\", when(column(\"ecom_event\") \u003d\u003d\u003d \"VIEW_PRODUCT\", 1).otherwise(0)).\n withColumn(\"added_to_cart\", when(column(\"ecom_event\") \u003d\u003d\u003d \"ADD_CART\", 1).otherwise(0)).\n withColumn(\"checked_out\", when(column(\"ecom_event\") \u003d\u003d\u003d \"CHECKOUT\", 1).otherwise(0)).\n withColumn(\"completed_purchase\", when(column(\"page_name\") \u003d\u003d\u003d \"Order Completion\", 1).otherwise(0))\n",
"dateUpdated": "Oct 5, 2015 11:59:20 PM",
"config": {
"tableHide": true,
"colWidth": 12.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "multiBarChart",
"height": 300.0,
"optionOpen": false,
"keys": [
{
"name": "sum(view)",
"index": 0.0,
"aggr": "sum"
}
],
"values": [
{
"name": "sum(view)",
"index": 0.0,
"aggr": "sum"
},
{
"name": "sum(cart)",
"index": 1.0,
"aggr": "sum"
},
{
"name": "sum(checkout)",
"index": 2.0,
"aggr": "sum"
},
{
"name": "sum(purchase)",
"index": 3.0,
"aggr": "sum"
}
],
"groups": [],
"scatter": {
"yAxis": {
"name": "sum(cart)",
"index": 1.0,
"aggr": "sum"
},
"xAxis": {
"name": "sum(view)",
"index": 0.0,
"aggr": "sum"
}
}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442796076656_-2037632492",
"id": "20150921-004116_254967394",
"result": {
"code": "SUCCESS",
"type": "TEXT",
"msg": "funnel: org.apache.spark.sql.DataFrame \u003d [hit_time: string, visid: double, hitid: double, ip: string, url: string, page_name: string, referrer: string, product: string, revenue: double, orders: int, units: int, ecom_event: string, browser: string, geo: struct\u003ccity:string,country:string,loc:struct\u003clatitude:double,longitude:double\u003e\u003e, dwell_time: bigint, visitnum: bigint, viewed_product: int, added_to_cart: int, checked_out: int, completed_purchase: int]\n"
},
"dateCreated": "Sep 21, 2015 12:41:16 AM",
"dateStarted": "Oct 5, 2015 11:59:20 PM",
"dateFinished": "Oct 5, 2015 11:59:20 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Funnel Chart",
"text": "// now we want to just get the flags for each event per session (session is visid and visitnum) then we sum them across all sessions\nval rollupFunnel \u003d funnel.\n groupBy(\"visid\", \"visitnum\").\n agg(\n max(\"viewed_product\").as(\"view\"),\n max(\"added_to_cart\").as(\"cart\"), \n max(\"checked_out\").as(\"checkout\"), \n max(\"completed_purchase\").as(\"purchase\")\n ).\n agg(\n sum(\"visitnum\"), \n sum(\"view\"), \n sum(\"cart\"), \n sum(\"checkout\"), \n sum(\"purchase\")\n )\n \nz.show(rollupFunnel)",
"dateUpdated": "Oct 5, 2015 11:59:25 PM",
"config": {
"tableHide": false,
"colWidth": 12.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "multiBarChart",
"height": 300.0,
"optionOpen": false,
"keys": [
{
"name": "sum(visitnum)",
"index": 0.0,
"aggr": "sum"
}
],
"values": [
{
"name": "sum(visitnum)",
"index": 0.0,
"aggr": "sum"
},
{
"name": "sum(view)",
"index": 1.0,
"aggr": "sum"
},
{
"name": "sum(cart)",
"index": 2.0,
"aggr": "sum"
},
{
"name": "sum(checkout)",
"index": 3.0,
"aggr": "sum"
},
{
"name": "sum(purchase)",
"index": 4.0,
"aggr": "sum"
}
],
"groups": [],
"scatter": {
"xAxis": {
"name": "sum(visitnum)",
"index": 0.0,
"aggr": "sum"
},
"yAxis": {
"name": "sum(view)",
"index": 1.0,
"aggr": "sum"
}
}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442801306606_2089264047",
"id": "20150921-020826_181074990",
"result": {
"code": "SUCCESS",
"type": "TABLE",
"msg": "sum(visitnum)\tsum(view)\tsum(cart)\tsum(checkout)\tsum(purchase)\n5377\t3196\t1850\t1576\t733\n"
},
"dateCreated": "Sep 21, 2015 2:08:26 AM",
"dateStarted": "Oct 5, 2015 11:59:25 PM",
"dateFinished": "Oct 5, 2015 11:59:30 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
},
{
"title": "Funnel with Segmentation",
"text": "// funnel for specific set of products\n\n// we define a \"goal\" which is whether or not they viewed products in the set, but since viewing a product will happen\n// earlier in the session we use a window to keep track of the goal as the session progresses\n\nval productFunnel \u003d funnel.\n withColumn(\"goal\", when(column(\"product\").isin(\"shorts\", \"t-shirts\"), 1).otherwise(0)).\n withColumn(\"goal\", max(\"goal\").over(sessionWindowSpec))\n\n// so now we just filter the funnel based on those events that met the \"goal\" \nval rollupFunnel \u003d productFunnel.\n where(\"goal \u003d 1\").\n groupBy(\"visid\", \"visitnum\").\n agg(\n max(\"viewed_product\").as(\"view\"), \n max(\"added_to_cart\").as(\"cart\"), \n max(\"checked_out\").as(\"checkout\"), \n max(\"completed_purchase\").as(\"purchase\")\n ).\n agg(\n sum(\"visitnum\"), \n sum(\"view\"), \n sum(\"cart\"), \n sum(\"checkout\"), \n sum(\"purchase\")\n )\n\nz.show(rollupFunnel)",
"dateUpdated": "Oct 5, 2015 2:32:31 PM",
"config": {
"tableHide": false,
"colWidth": 12.0,
"editorMode": "ace/mode/scala",
"editorHide": false,
"title": true,
"graph": {
"mode": "multiBarChart",
"height": 300.0,
"optionOpen": false,
"keys": [
{
"name": "sum(visitnum)",
"index": 0.0,
"aggr": "sum"
}
],
"values": [
{
"name": "sum(visitnum)",
"index": 0.0,
"aggr": "sum"
},
{
"name": "sum(view)",
"index": 1.0,
"aggr": "sum"
},
{
"name": "sum(cart)",
"index": 2.0,
"aggr": "sum"
},
{
"name": "sum(checkout)",
"index": 3.0,
"aggr": "sum"
},
{
"name": "sum(purchase)",
"index": 4.0,
"aggr": "sum"
}
],
"groups": [],
"scatter": {
"xAxis": {
"name": "sum(visitnum)",
"index": 0.0,
"aggr": "sum"
},
"yAxis": {
"name": "sum(view)",
"index": 1.0,
"aggr": "sum"
}
}
}
},
"settings": {
"params": {},
"forms": {}
},
"jobName": "paragraph_1442779107688_-271146459",
"id": "20150920-195827_234480414",
"result": {
"code": "SUCCESS",
"type": "TABLE",
"msg": "sum(visitnum)\tsum(view)\tsum(cart)\tsum(checkout)\tsum(purchase)\n85\t80\t41\t31\t15\n"
},
"dateCreated": "Sep 20, 2015 7:58:27 PM",
"dateStarted": "Sep 22, 2015 8:12:46 PM",
"dateFinished": "Sep 22, 2015 8:12:57 PM",
"status": "FINISHED",
"progressUpdateIntervalMs": 500
}
],
"name": "DC Apache Spark Meetup",
"id": "2B1VQSD4A",
"angularObjects": {
"2AME32G1N": [],
"2AMGUAZPA": [],
"2AMPBB13H": [],
"2AM1WSWKF": [],
"2APKEWWKR": []
},
"config": {},
"info": {}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment