|
<script>window.settings = {"enableAutoCompleteAsYouType":[],"devTierName":"Community Edition","workspaceFeaturedLinks":[{"linkURI":"https://docs.cloud.databricks.com/docs/latest/databricks_guide/index.html","displayName":"Databricks Guide","icon":"question"},{"linkURI":"https://docs.cloud.databricks.com/docs/latest/sample_applications/index.html","displayName":"Application Examples","icon":"code"},{"linkURI":"https://docs.cloud.databricks.com/docs/latest/courses/index.html","displayName":"Training","icon":"graduation-cap"}],"dbcForumURL":"http://forums.databricks.com/","nodeInfo":{"node_types":[{"spark_heap_memory":4800,"instance_type_id":"r3.2xlarge","spark_core_oversubscription_factor":3.0,"node_type_id":"dev-tier-node","description":"Community Optimized","container_memory_mb":6000,"memory_mb":6144,"num_cores":0.88}],"default_node_type_id":"dev-tier-node"},"enableThirdPartyApplicationsUI":false,"enableClusterAcls":false,"notebookRevisionVisibilityHorizon":999999,"enableTableHandler":true,"isAdmin":true,"enableLargeResultDownload":true,"zoneInfos":[{"id":"us-west-2c","isDefault":true},{"id":"us-west-2b","isDefault":false},{"id":"us-west-2a","isDefault":false}],"enablePublishNotebooks":true,"enableJobAclsConfig":false,"enableFullTextSearch":false,"enableElasticSparkUI":false,"clusters":true,"allowRunOnPendingClusters":true,"applications":false,"fileStoreBase":"FileStore","configurableSparkOptionsSpec":[{"keyPattern":"spark\\.kryo(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.kryo.*","valuePatternDisplay":"*","description":"Configuration options for Kryo serialization"},{"keyPattern":"spark\\.io\\.compression\\.codec","valuePattern":"(lzf|snappy|org\\.apache\\.spark\\.io\\.LZFCompressionCodec|org\\.apache\\.spark\\.io\\.SnappyCompressionCodec)","keyPatternDisplay":"spark.io.compression.codec","valuePatternDisplay":"snappy|lzf","description":"The codec used to compress internal data such as RDD partitions, broadcast variables and shuffle outputs."},{"keyPattern":"spark\\.serializer","valuePattern":"(org\\.apache\\.spark\\.serializer\\.JavaSerializer|org\\.apache\\.spark\\.serializer\\.KryoSerializer)","keyPatternDisplay":"spark.serializer","valuePatternDisplay":"org.apache.spark.serializer.JavaSerializer|org.apache.spark.serializer.KryoSerializer","description":"Class to use for serializing objects that will be sent over the network or need to be cached in serialized form."},{"keyPattern":"spark\\.rdd\\.compress","valuePattern":"(true|false)","keyPatternDisplay":"spark.rdd.compress","valuePatternDisplay":"true|false","description":"Whether to compress serialized RDD partitions (e.g. for StorageLevel.MEMORY_ONLY_SER). Can save substantial space at the cost of some extra CPU time."},{"keyPattern":"spark\\.speculation","valuePattern":"(true|false)","keyPatternDisplay":"spark.speculation","valuePatternDisplay":"true|false","description":"Whether to use speculation (recommended off for streaming)"},{"keyPattern":"spark\\.es(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"es(\\.([^\\.]+))+","valuePattern":".*","keyPatternDisplay":"es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"spark\\.(storage|shuffle)\\.memoryFraction","valuePattern":"0?\\.0*([1-9])([0-9])*","keyPatternDisplay":"spark.(storage|shuffle).memoryFraction","valuePatternDisplay":"(0.0,1.0)","description":"Fraction of Java heap to use for Spark's shuffle or storage"},{"keyPattern":"spark\\.streaming\\.backpressure\\.enabled","valuePattern":"(true|false)","keyPatternDisplay":"spark.streaming.backpressure.enabled","valuePatternDisplay":"true|false","description":"Enables or disables Spark Streaming's internal backpressure mechanism (since 1.5). This enables the Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. This rate is upper bounded by the values `spark.streaming.receiver.maxRate` and `spark.streaming.kafka.maxRatePerPartition` if they are set."},{"keyPattern":"spark\\.streaming\\.receiver\\.maxRate","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.receiver.maxRate","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which each receiver will receive data. Effectively, each stream will consume at most this number of records per second. Setting this configuration to 0 or a negative number will put no limit on the rate. See the deployment guide in the Spark Streaming programing guide for mode details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRatePerPartition","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRatePerPartition","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which data will be read from each Kafka partition when using the Kafka direct stream API introduced in Spark 1.3. See the Kafka Integration guide for more details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRetries","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRetries","valuePatternDisplay":"numeric","description":"Maximum number of consecutive retries the driver will make in order to find the latest offsets on the leader of each partition (a default value of 1 means that the driver will make a maximum of 2 attempts). Only applies to the Kafka direct stream API introduced in Spark 1.3."},{"keyPattern":"spark\\.streaming\\.ui\\.retainedBatches","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.ui.retainedBatches","valuePatternDisplay":"numeric","description":"How many batches the Spark Streaming UI and status APIs remember before garbage collecting."}],"enableReactNotebookComments":true,"enableResetPassword":true,"enableJobsSparkUpgrade":true,"sparkVersions":[{"key":"1.6.x-ubuntu15.10","displayName":"Spark 1.6.x (Hadoop 1)","packageLabel":"spark-image-8cea23fb9094e174bf5815d79009f4a8e383eb86cf2909cf6c6434ed8da2a16a","upgradable":true,"deprecated":false,"customerVisible":false},{"key":"1.4.x-ubuntu15.10","displayName":"Spark 1.4.1 (Hadoop 1)","packageLabel":"spark-image-f710650fb8aaade8e4e812368ea87c45cd8cd0b5e6894ca6c94f3354e8daa6dc","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.x-ubuntu15.10-hadoop1","displayName":"Spark 1.6.x (Hadoop 1)","packageLabel":"spark-image-8cea23fb9094e174bf5815d79009f4a8e383eb86cf2909cf6c6434ed8da2a16a","upgradable":true,"deprecated":false,"customerVisible":false},{"key":"1.6.1-ubuntu15.10-hadoop1","displayName":"Spark 1.6.1 (Hadoop 1)","packageLabel":"spark-image-21d1cac181b7b8856dd1b4214a3a734f95b5289089349db9d9c926cb87d843db","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.2-ubuntu15.10-hadoop1","displayName":"Spark 1.6.2 (Hadoop 1)","packageLabel":"spark-image-8cea23fb9094e174bf5815d79009f4a8e383eb86cf2909cf6c6434ed8da2a16a","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.2-ubuntu15.10-hadoop2","displayName":"Spark 1.6.2 (Hadoop 2)","packageLabel":"spark-image-161245e66d887cd775e23286a54bab0b146143e1289f25bd1732beac454a1561","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.1-ubuntu15.10-hadoop2","displayName":"Spark 1.6.1 (Hadoop 2)","packageLabel":"spark-image-4cafdf8bc6cba8edad12f441e3b3f0a8ea27da35c896bc8290e16b41fd15496a","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.5.x-ubuntu15.10","displayName":"Spark 1.5.2 (Hadoop 1)","packageLabel":"spark-image-c9d2a8abf41f157a4acc6d52bc721090346f6fea2de356f3a66e388f54481698","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.3.x-ubuntu15.10","displayName":"Spark 1.3.0 (Hadoop 1)","packageLabel":"spark-image-40d2842670bc3dc178b14042501847d76171437ccf70613fa397a7a24c48b912","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"2.0.x-ubuntu15.10","displayName":"Spark 2.0 (RC4)","packageLabel":"spark-image-a811578657998e545688196186dfe309b95343c623fa413033fd6cfb7197b4e0","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.0-ubuntu15.10","displayName":"Spark 1.6.0 (Hadoop 1)","packageLabel":"spark-image-10ef758029b8c7e19cd7f4fb52fff9180d75db92ca071bd94c47f3c1171a7cb5","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.x-ubuntu15.10-hadoop2","displayName":"Spark 1.6.x (Hadoop 2)","packageLabel":"spark-image-161245e66d887cd775e23286a54bab0b146143e1289f25bd1732beac454a1561","upgradable":true,"deprecated":false,"customerVisible":false}],"enableRestrictedClusterCreation":true,"enableFeedback":true,"enableClusterAutoScaling":false,"defaultNumWorkers":0,"serverContinuationTimeoutMillis":10000,"driverStderrFilePrefix":"stderr","enableNotebookRefresh":false,"driverStdoutFilePrefix":"stdout","enableSparkDocsSearch":true,"sparkHistoryServerEnabled":true,"sanitizeMarkdownHtml":true,"enableIPythonImportExport":true,"enableNotebookHistoryDiffing":true,"branch":"2.22.2","accountsLimit":3,"enableNotebookGitBranching":true,"local":false,"enableStrongPassword":false,"displayDefaultContainerMemoryGB":6,"deploymentMode":"production","useSpotForWorkers":true,"enableUserInviteWorkflow":true,"enableStaticNotebooks":true,"enableCssTransitions":true,"showHomepageFeaturedLinks":true,"pricingURL":"https://databricks.com/product/pricing","enableClusterAclsConfig":false,"notifyLastLogin":false,"enableNotebookGitVersioning":true,"files":"files/","enableDriverLogsUI":true,"disableLegacyDashboards":true,"enableWorkspaceAclsConfig":false,"dropzoneMaxFileSize":4096,"enableNewDashboardViews":true,"driverLog4jFilePrefix":"log4j","enableSingleSignOn":true,"enableMavenLibraries":true,"displayRowLimit":1000,"defaultSparkVersion":{"key":"1.6.1-ubuntu15.10-hadoop1","displayName":"Spark 1.6.1 (Hadoop 1)","packageLabel":"spark-image-21d1cac181b7b8856dd1b4214a3a734f95b5289089349db9d9c926cb87d843db","upgradable":true,"deprecated":false,"customerVisible":true},"enableMountAclsConfig":false,"enableClusterAclsByTier":false,"disallowAddingAdmins":true,"enableSparkConfUI":true,"featureTier":"DEVELOPER_BASIC_TIER","enableOrgSwitcherUI":true,"clustersLimit":1,"enableJdbcImport":true,"logfiles":"logfiles/","enableWebappSharding":true,"enableClusterDeltaUpdates":true,"enableSingleSignOnLogin":false,"useFixedStaticNotebookVersionForDevelopment":false,"enableMountAcls":false,"requireEmailUserName":true,"enableDashboardViews":false,"dbcFeedbackURL":"mailto:feedback@databricks.com","enableMountAclService":true,"enableWorkspaceAclService":true,"enableWorkspaceAcls":false,"gitHash":"","showWorkspaceFeaturedLinks":true,"signupUrl":"https://databricks.com/try-databricks","allowFeedbackForumAccess":true,"enableImportFromUrl":true,"enableMiniClusters":true,"showDevTierBetaVersion":true,"enableDebugUI":false,"allowNonAdminUsers":true,"enableSingleSignOnByTier":false,"staticNotebookResourceUrl":"https://databricks-prod-cloudfront.cloud.databricks.com/static/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855/","enableSparkPackages":true,"dynamicSparkVersions":true,"enableNotebookHistoryUI":true,"showDebugCounters":false,"enableFolderHtmlExport":true,"enableSparkVersionsUI":true,"homepageFeaturedLinks":[{"linkURI":"https://docs.cloud.databricks.com/docs/latest/featured_notebooks/A%20Gentle%20Introduction%20to%20Apache%20Spark%20on%20Databricks.html","displayName":"Introduction to Apache Spark on Databricks","icon":"img/home/Python_icon.svg"},{"linkURI":"https://docs.cloud.databricks.com/docs/latest/featured_notebooks/Quick%20Start%20DataFrames.html","displayName":"Quick Start DataFrames","icon":"img/home/Scala_icon.svg"},{"linkURI":"https://docs.cloud.databricks.com/docs/latest/featured_notebooks/GSW%20Passing%20Analysis%20(new).html","displayName":"GSW Passing Analysis (new)","icon":"img/home/Python_icon.svg"}],"upgradeURL":"https://accounts.cloud.databricks.com/registration.html#login","notebookLoadingBackground":"#fff","enableServerAutoComplete":true,"enableStaticHtmlImport":true,"enableTerminal":false,"defaultMemoryPerContainerMB":6000,"enablePresenceUI":true,"accounts":true,"useFramedStaticNotebooks":true,"enableNewProgressReportUI":true,"defaultCoresPerContainer":4};</script> |
|
<script>var __DATABRICKS_NOTEBOOK_MODEL = {"version":"NotebookV1","origId":2054245663926531,"name":"Scala_Demo","language":"scala","commands":[{"version":"CommandV1","origId":2054245663926533,"guid":"0ba80766-cee6-442a-9718-9a9f341e9d8f","subtype":"command","commandType":"auto","position":1.0,"command":"val ENCODED_SECRET_KEY = SECRET_KEY.replace(\"/\", \"%2F\")\nval AWS_BUCKET_NAME = \"4quant-devpost\"\nval MOUNT_NAME = \"flat_medical/cost_usage_records\"","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">AWS_BUCKET_NAME: String = 4quant-devpost\nMOUNT_NAME: String = flat_medical//cost_usage_records\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"workflows":[],"startTime":1.46920680789E12,"submitTime":1.469206808099E12,"finishTime":1.46920680999E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"97b54075-2ae9-4b82-a3f7-faac89bb1aca"},{"version":"CommandV1","origId":2054245663926534,"guid":"c8e7bff8-cc8b-4b72-9667-9fb7a686a9fd","subtype":"command","commandType":"auto","position":2.0,"command":"val all_med_records = sqlContext.read.json(s\"s3a://$ACCESS_KEY:$ENCODED_SECRET_KEY@$AWS_BUCKET_NAME/$MOUNT_NAME\")","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">all_med_records: org.apache.spark.sql.DataFrame = [_1: string, _2: bigint ... 1 more field]\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"Cancelled","error":null,"workflows":[],"startTime":1.469206812952E12,"submitTime":1.469206813149E12,"finishTime":1.469207003702E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"Load the Medical Cost and Usage Record Data","showCommandTitle":true,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"7877ba7d-7d65-4fca-9409-e78a0cb6c4b3"},{"version":"CommandV1","origId":2054245663926535,"guid":"2a8c56a6-9730-4c07-94b3-0f8dba99125e","subtype":"command","commandType":"auto","position":3.0,"command":"all_med_records.first()","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res0: org.apache.spark.sql.Row = [Complications - National.csv,43236,WrappedArray(["Measure Name","Deaths among Patients with Serious Treatable Complications after Surgery"], ["Measure ID","PSI_4_SURG_COMP"], ["National Rate","117.75"], ["Number of Hospitals Worse","65"], ["Number of Hospitals Same","1730"], ["Number of Hospitals Better","45"], ["Number of Hospitals Too Few","1028"], ["Footnote",""], ["Measure Start Date","07/01/2012"], ["Measure End Date"\r,"06/30/2014"\r])]\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"workflows":[],"startTime":1.469207012895E12,"submitTime":1.469207013097E12,"finishTime":1.469207018883E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"55ff6c14-9e7a-4e9a-8194-1ae5a47a2d06"},{"version":"CommandV1","origId":2054245663926551,"guid":"df97a707-0a80-4204-a592-31dfc384f8ce","subtype":"command","commandType":"auto","position":3.5,"command":"all_med_records.createOrReplaceTempView(\"Medical_Summary\")","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"workflows":[],"startTime":1.469207018888E12,"submitTime":1.469207018947E12,"finishTime":1.46920701898E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"83925c04-0169-456c-b585-73f3ef80ffe7"},{"version":"CommandV1","origId":2054245663926552,"guid":"bb82a9be-6e70-4e9f-9c78-3b52e913d514","subtype":"command","commandType":"auto","position":3.75,"command":"// simple query from the table\nsqlContext.sql(\"\"\"\nSELECT * FROM Medical_Summary LIMIT 1\n\"\"\").collect()","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res2: Array[org.apache.spark.sql.Row] = Array([Complications - National.csv,43236,WrappedArray(["Measure Name","Deaths among Patients with Serious Treatable Complications after Surgery"], ["Measure ID","PSI_4_SURG_COMP"], ["National Rate","117.75"], ["Number of Hospitals Worse","65"], ["Number of Hospitals Same","1730"], ["Number of Hospitals Better","45"], ["Number of Hospitals Too Few","1028"], ["Footnote",""], ["Measure Start Date","07/01/2012"], ["Measure End Date"\r,"06/30/2014"\r])])\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"workflows":[],"startTime":1.469207049868E12,"submitTime":1.469207050079E12,"finishTime":1.469207052227E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"fd2499c8-78df-4a99-aa38-c612e19cc140"},{"version":"CommandV1","origId":2054245663926553,"guid":"602eed89-d5f3-4d04-a77b-95410ef96535","subtype":"command","commandType":"auto","position":3.875,"command":"// reformat the table to the expanded format\nsqlContext.sql(\"\"\"\nSELECT _1 sourceName, _2 rowNumber, med_key._1 fieldName, med_key._2 fieldValue\nFROM Medical_Summary LATERAL VIEW explode(_3) Med_Row AS med_key\n\"\"\").createOrReplaceTempView(\"Medical_Extended_Summary\")\n","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"org.apache.spark.sql.AnalysisException: cannot resolve 'explode(medical_summary.`_2`)' due to data type mismatch: input to function explode should be array or map type, not LongType; line 3 pos 21","error":"<div class=\"ansiout\">\tat org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)\n\tat org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:82)\n\tat org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:74)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:301)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:301)\n\tat org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:300)\n\tat org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionUp$1(QueryPlan.scala:190)\n\tat org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$2(QueryPlan.scala:200)\n\tat org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$5.apply(QueryPlan.scala:209)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:179)\n\tat org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsUp(QueryPlan.scala:209)\n\tat org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:74)\n\tat org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:67)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:126)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125)\n\tat scala.collection.immutable.List.foreach(List.scala:318)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:125)\n\tat org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:67)\n\tat org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:58)\n\tat org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:49)\n\tat org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:64)\n\tat org.apache.spark.sql.SparkSession.sql(SparkSession.scala:580)\n\tat org.apache.spark.sql.SQLContext.sql(SQLContext.scala:682)</div>","workflows":[],"startTime":1.469207067939E12,"submitTime":1.469207068146E12,"finishTime":1.469207068028E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"6a410af3-1700-4c32-9b5a-2c591d72dd5e"},{"version":"CommandV1","origId":2054245663926543,"guid":"f8266ddf-03be-41f4-a4b9-250d9073b7c5","subtype":"command","commandType":"auto","position":4.0,"command":"// find all of the state categories and show the number of entries for each state\nsqlContext.sql(\"\"\"\nSELECT fieldValue StateName, COUNT(*) Appearences FROM Medical_Extended_Summary WHERE fieldName == \"STATE\" GROUP BY fieldValue LIMIT 1\n\"\"\").createOrReplaceTempView(\"State_Medical_Extended_Summary\")","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"The spark driver has stopped unexpectedly and is restarting. Your notebook will be automatically reattached.","error":null,"workflows":[],"startTime":1.469207129575E12,"submitTime":1.469207129786E12,"finishTime":1.469207129667E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"e57569f4-5ec4-49a4-b903-a58d53d5fb61"},{"version":"CommandV1","origId":2054245663926544,"guid":"8231b575-7a3a-4d4e-87eb-80c500e13164","subtype":"command","commandType":"auto","position":5.0,"command":"val all_med_keywords = sqlContext.read.json(s\"s3a://$ACCESS_KEY:$ENCODED_SECRET_KEY@$AWS_BUCKET_NAME/flat_medical/publication_keywords\")","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">all_med_keywords: org.apache.spark.sql.DataFrame = [abstract: string, affiliation: string ... 6 more fields]\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"Cancelled","error":null,"workflows":[],"startTime":1.469207338529E12,"submitTime":1.469207338743E12,"finishTime":1.469207362796E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"Load the Keyword Data","showCommandTitle":true,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"3f32aaf5-068b-4246-977a-c636f6f6e520"},{"version":"CommandV1","origId":2054245663926569,"guid":"33ab0d18-704c-4170-b170-28f0a75a44c5","subtype":"command","commandType":"auto","position":5.5,"command":"all_med_keywords.columns","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res12: Array[String] = Array(abstract, affiliation, journal, keyword, month, source, title, year)\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"workflows":[],"startTime":1.469207490413E12,"submitTime":1.469207490625E12,"finishTime":1.469207490466E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"b2704eab-138d-4709-af52-c5d91a962c2b"},{"version":"CommandV1","origId":2054245663926545,"guid":"9af5542e-1ccf-4858-9225-753c6444907d","subtype":"command","commandType":"auto","position":6.0,"command":"all_med_keywords.createOrReplaceTempView(\"Medical_Research_Keywords\")","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"<div class=\"ansiout\"><console>:44: error: not found: value cur_row\nval $ires29 = cur_row\n ^\n<console>:40: error: not found: value cur_row\n cur_row = all_med_keywords.first()\n ^\n</div>","error":null,"workflows":[],"startTime":1.469207468362E12,"submitTime":1.469207468576E12,"finishTime":1.469207468434E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"8fd2d6e2-0cde-4648-91c4-44de1281135f"},{"version":"CommandV1","origId":2054245663926546,"guid":"1b99364b-c078-4180-acca-753768e5ecfb","subtype":"command","commandType":"auto","position":7.0,"command":"sqlContext.sql(\"\"\"\nSELECT keyword, MIN(year), MAX(year), COUNT(*) Occurences FROM Medical_Research_Keywords GROUP BY keyword\n\"\"\").first()","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res16: org.apache.spark.sql.Row = [,,,88]\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"java.util.NoSuchElementException: next on empty iterator","error":"<div class=\"ansiout\">\tat scala.collection.Iterator$$anon$2.next(Iterator.scala:39)\n\tat scala.collection.Iterator$$anon$2.next(Iterator.scala:37)\n\tat scala.collection.IndexedSeqLike$Elements.next(IndexedSeqLike.scala:64)\n\tat scala.collection.IterableLike$class.head(IterableLike.scala:91)\n\tat scala.collection.mutable.ArrayOps$ofRef.scala$collection$IndexedSeqOptimized$$super$head(ArrayOps.scala:108)\n\tat scala.collection.IndexedSeqOptimized$class.head(IndexedSeqOptimized.scala:120)\n\tat scala.collection.mutable.ArrayOps$ofRef.head(ArrayOps.scala:108)\n\tat org.apache.spark.sql.Dataset.head(Dataset.scala:1933)\n\tat org.apache.spark.sql.Dataset.first(Dataset.scala:1940)</div>","workflows":[],"startTime":1.469207597264E12,"submitTime":1.46920759748E12,"finishTime":1.469207625039E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"b37deea3-0aaf-4294-98e8-5a11de31d529"}],"dashboards":[],"guid":"46f3cef5-df05-4e90-a0a4-299397c884a2","globalVars":{},"iPythonMetadata":null,"inputWidgets":{}};</script> |
Hi,
I want to know how to download the data .