sujitpal/wmd_short_sentences_scala.html

## wmd_short_sentences_scala.html
<!DOCTYPE html>
<!-- Document language is declared with the lang attribute; this replaces the
     non-conforming Content-Language pragma the export originally emitted. -->
<html lang="en">
<head>
  <!-- Single character-encoding declaration, placed first in <head>. The
       original also carried a second declaration via http-equiv="Content-Type";
       HTML permits only one per document. -->
  <meta charset="utf-8">
  <!-- Marker emitted by the Databricks notebook HTML export. -->
  <meta name="databricks-html-version" content="1">
  <title>wmd_short_sentences - Databricks</title>

  <!-- Asks Google not to offer translation of this page. -->
  <meta name="google" content="notranslate">

  <!-- Monospace web font (weights 400 and 700). -->
  <link rel="stylesheet"
    href="https://fonts.googleapis.com/css?family=Source+Code+Pro:400,700">

  <!-- Stylesheets, all served from the same versioned static path as
       notebook-main.js. The type attribute is omitted because text/css is
       the default for rel="stylesheet". -->
  <link rel="stylesheet" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201604171002070700-c4c797e204f33fa939fad994617c60c2ea182ad5/lib/css/bootstrap.min.css">
  <link rel="stylesheet" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201604171002070700-c4c797e204f33fa939fad994617c60c2ea182ad5/lib/jquery-ui-bundle/jquery-ui.min.css">
  <link rel="stylesheet" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201604171002070700-c4c797e204f33fa939fad994617c60c2ea182ad5/css/main.css">
  <link rel="stylesheet" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201604171002070700-c4c797e204f33fa939fad994617c60c2ea182ad5/css/print.css" media="print">
  <!-- Favicon. The original declared type="image/png" for a .ico URL; the
       mismatched hint is dropped so the browser determines the format itself. -->
  <link rel="icon" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201604171002070700-c4c797e204f33fa939fad994617c60c2ea182ad5/img/favicon.ico">
<script>window.settings = {"enableAutoCompleteAsYouType":[],"devTierName":"Community Edition","workspaceFeaturedLinks":[{"linkURI":"https://docs.cloud.databricks.com/docs/latest/databricks_guide/index.html","displayName":"Databricks Guide","icon":"question"},{"linkURI":"https://docs.cloud.databricks.com/docs/latest/sample_applications/index.html","displayName":"Application Examples","icon":"code"},{"linkURI":"https://docs.cloud.databricks.com/docs/latest/courses/index.html","displayName":"Training","icon":"graduation-cap"}],"sparkDocsSearchGoogleCx":"004588677886978090460:_rj0wilqwdm","dbcForumURL":"http://forums.databricks.com/","dbfsS3Host":"https://databricks-prod-storage-virginia.s3.amazonaws.com","nodeInfo":{"node_types":[{"spark_heap_memory":23800,"instance_type_id":"r3.2xlarge","node_type_id":"memory-optimized","description":"Memory Optimized","container_memory_mb":28000,"memory_mb":30720,"num_cores":4.0},{"spark_heap_memory":9702,"instance_type_id":"c3.4xlarge","node_type_id":"compute-optimized","description":"Compute Optimized","container_memory_mb":12128,"memory_mb":15360,"num_cores":8.0}],"default_node_type_id":"memory-optimized"},"enableThirdPartyApplicationsUI":false,"enableClusterAcls":true,"notebookRevisionVisibilityHorizon":0,"enableTableHandler":true,"isAdmin":false,"enableLargeResultDownload":true,"zoneInfos":[{"id":"us-east-1a","isDefault":true},{"id":"us-east-1c","isDefault":false}],"nameAndEmail":"Sujit Pal (sujit.pal@elsevier.com)","enablePublishNotebooks":false,"enableLegacySQLWidgets":true,"enableJobAclsConfig":false,"enableFullTextSearch":true,"enableElasticSparkUI":true,"clusters":false,"allowRunOnPendingClusters":true,"hideOffHeapCache":false,"applications":false,"useStaticGuide":false,"fileStoreBase":"FileStore","configurableSparkOptionsSpec":[{"keyPattern":"spark\\.kryo(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.kryo.*","valuePatternDisplay":"*","description":"Configuration options for Kryo 
serialization"},{"keyPattern":"spark\\.io\\.compression\\.codec","valuePattern":"(lzf|snappy|org\\.apache\\.spark\\.io\\.LZFCompressionCodec|org\\.apache\\.spark\\.io\\.SnappyCompressionCodec)","keyPatternDisplay":"spark.io.compression.codec","valuePatternDisplay":"snappy|lzf","description":"The codec used to compress internal data such as RDD partitions, broadcast variables and shuffle outputs."},{"keyPattern":"spark\\.serializer","valuePattern":"(org\\.apache\\.spark\\.serializer\\.JavaSerializer|org\\.apache\\.spark\\.serializer\\.KryoSerializer)","keyPatternDisplay":"spark.serializer","valuePatternDisplay":"org.apache.spark.serializer.JavaSerializer|org.apache.spark.serializer.KryoSerializer","description":"Class to use for serializing objects that will be sent over the network or need to be cached in serialized form."},{"keyPattern":"spark\\.rdd\\.compress","valuePattern":"(true|false)","keyPatternDisplay":"spark.rdd.compress","valuePatternDisplay":"true|false","description":"Whether to compress serialized RDD partitions (e.g. for StorageLevel.MEMORY_ONLY_SER). 
Can save substantial space at the cost of some extra CPU time."},{"keyPattern":"spark\\.speculation","valuePattern":"(true|false)","keyPatternDisplay":"spark.speculation","valuePatternDisplay":"true|false","description":"Whether to use speculation (recommended off for streaming)"},{"keyPattern":"spark\\.es(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"es(\\.([^\\.]+))+","valuePattern":".*","keyPatternDisplay":"es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"spark\\.(storage|shuffle)\\.memoryFraction","valuePattern":"0?\\.0*([1-9])([0-9])*","keyPatternDisplay":"spark.(storage|shuffle).memoryFraction","valuePatternDisplay":"(0.0,1.0)","description":"Fraction of Java heap to use for Spark's shuffle or storage"},{"keyPattern":"spark\\.streaming\\.backpressure\\.enabled","valuePattern":"(true|false)","keyPatternDisplay":"spark.streaming.backpressure.enabled","valuePatternDisplay":"true|false","description":"Enables or disables Spark Streaming's internal backpressure mechanism (since 1.5). This enables the Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. This rate is upper bounded by the values `spark.streaming.receiver.maxRate` and `spark.streaming.kafka.maxRatePerPartition` if they are set."},{"keyPattern":"spark\\.streaming\\.receiver\\.maxRate","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.receiver.maxRate","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which each receiver will receive data. Effectively, each stream will consume at most this number of records per second. 
Setting this configuration to 0 or a negative number will put no limit on the rate. See the deployment guide in the Spark Streaming programing guide for mode details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRatePerPartition","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRatePerPartition","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which data will be read from each Kafka partition when using the Kafka direct stream API introduced in Spark 1.3. See the Kafka Integration guide for more details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRetries","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRetries","valuePatternDisplay":"numeric","description":"Maximum number of consecutive retries the driver will make in order to find the latest offsets on the leader of each partition (a default value of 1 means that the driver will make a maximum of 2 attempts). Only applies to the Kafka direct stream API introduced in Spark 1.3."},{"keyPattern":"spark\\.streaming\\.ui\\.retainedBatches","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.ui.retainedBatches","valuePatternDisplay":"numeric","description":"How many batches the Spark Streaming UI and status APIs remember before garbage collecting."}],"enableReactNotebookComments":true,"enableResetPassword":true,"enableJobsSparkUpgrade":true,"sparkVersions":[{"key":"1.3.x-ubuntu15.10","displayName":"Spark 1.3.0 (Hadoop 1)","packageLabel":"spark-1.3-jenkins-ip-10-30-10-144-Uc4c797e204-Sa2ee4664b2-2016-04-17-17:58:35.805047","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.4.x-ubuntu15.10","displayName":"Spark 1.4.1 (Hadoop 1)","packageLabel":"spark-1.4-jenkins-ip-10-30-10-144-Uc4c797e204-S9c254ab12a-2016-04-17-17:58:35.805047","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.5.x-ubuntu15.10","displayName":"Spark 1.5.2 (Hadoop 
1)","packageLabel":"spark-1.5-jenkins-ip-10-30-10-144-Uc4c797e204-S9ca52d000d-2016-04-17-17:58:35.805047","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.0-ubuntu15.10","displayName":"Spark 1.6.0 (Hadoop 1)","packageLabel":"spark-1.6.0-jenkins-ip-10-30-10-144-Uc4c797e204-Sf90f83597b-2016-04-17-17:58:35.805047","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.1-ubuntu15.10-hadoop1","displayName":"Spark 1.6.1 (Hadoop 1)","packageLabel":"spark-1.6.1-hadoop1-jenkins-ip-10-30-10-144-Uc4c797e204-S3a9b9c8fce-2016-04-17-17:58:35.805047","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.1-ubuntu15.10-hadoop2","displayName":"Spark 1.6.1 (Hadoop 2)","packageLabel":"spark-1.6.1-hadoop2-jenkins-ip-10-30-10-144-Uc4c797e204-S3a9b9c8fce-2016-04-17-17:58:35.805047","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.x-ubuntu15.10","displayName":"Spark 1.6.x (Hadoop 1)","packageLabel":"spark-1.6.1-hadoop1-jenkins-ip-10-30-10-144-Uc4c797e204-S3a9b9c8fce-2016-04-17-17:58:35.805047","upgradable":true,"deprecated":false,"customerVisible":false},{"key":"1.6.x-ubuntu15.10-hadoop1","displayName":"Spark 1.6.x (Hadoop 1)","packageLabel":"spark-1.6.1-hadoop1-jenkins-ip-10-30-10-144-Uc4c797e204-S3a9b9c8fce-2016-04-17-17:58:35.805047","upgradable":true,"deprecated":false,"customerVisible":false},{"key":"1.6.x-ubuntu15.10-hadoop2","displayName":"Spark 1.6.x (Hadoop 2)","packageLabel":"spark-1.6.1-hadoop2-jenkins-ip-10-30-10-144-Uc4c797e204-S3a9b9c8fce-2016-04-17-17:58:35.805047","upgradable":true,"deprecated":false,"customerVisible":false},{"key":"master","displayName":"Spark master 
(dev)","packageLabel":"","upgradable":true,"deprecated":false,"customerVisible":false}],"enableRestrictedClusterCreation":false,"enableFeedback":false,"enableClusterAutoScaling":false,"defaultNumWorkers":8,"defaultZoneId":"us-east-1a","serverContinuationTimeoutMillis":10000,"driverStderrFilePrefix":"stderr","enableNotebookRefresh":true,"driverStdoutFilePrefix":"stdout","enableSparkDocsSearch":true,"prefetchSidebarNodes":true,"sparkHistoryServerEnabled":true,"sanitizeMarkdownHtml":true,"enableIPythonImportExport":true,"enableNotebookHistoryDiffing":true,"branch":"2.17.2","accountsLimit":-1,"enableNotebookGitBranching":true,"local":false,"enableStrongPassword":false,"displayDefaultContainerMemoryGB":30,"deploymentMode":"production","useSpotForWorkers":false,"enableUserInviteWorkflow":false,"enableStaticNotebooks":true,"dbcGuideURL":"#workspace/databricks_guide/00 Welcome to Databricks","enableCssTransitions":true,"showHomepageFeaturedLinks":false,"pricingURL":"https://databricks.com/product/pricing","enableClusterAclsConfig":false,"orgId":0,"enableNotebookGitVersioning":true,"files":"files/","enableDriverLogsUI":true,"disableLegacyDashboards":false,"enableWorkspaceAclsConfig":true,"dropzoneMaxFileSize":4096,"enableNewDashboardViews":true,"driverLog4jFilePrefix":"log4j","enableMavenLibraries":true,"displayRowLimit":1000,"defaultSparkVersion":{"key":"1.6.0-ubuntu15.10","displayName":"Spark 1.6.0 (Hadoop 
1)","packageLabel":"spark-1.6.0-jenkins-ip-10-30-10-144-Uc4c797e204-Sf90f83597b-2016-04-17-17:58:35.805047","upgradable":true,"deprecated":false,"customerVisible":true},"enableMountAclsConfig":false,"clusterPublisherRootId":86382,"enableLatestJobRunResultPermalink":true,"enableClusterAclsByTier":true,"disallowAddingAdmins":false,"enableSparkConfUI":true,"featureTier":"UNKNOWN_TIER","enableOrgSwitcherUI":false,"clustersLimit":-1,"enableJdbcImport":true,"logfiles":"logfiles/","enableWebappSharding":false,"enableClusterDeltaUpdates":true,"csrfToken":"4114db71-5345-493a-b4fb-b6932d22212f","useFixedStaticNotebookVersionForDevelopment":false,"enableMountAcls":false,"requireEmailUserName":true,"enableDashboardViews":false,"dbcFeedbackURL":"http://feedback.databricks.com/forums/263785-product-feedback","enableMountAclService":true,"enableWorkspaceAclService":true,"someName":"Sujit Pal","enableWorkspaceAcls":true,"gitHash":"c4c797e204f33fa939fad994617c60c2ea182ad5","showWorkspaceFeaturedLinks":false,"userFullname":"Sujit Pal","allowFeedbackForumAccess":true,"enableImportFromUrl":true,"sessionIdleTimeout":-1,"enableMiniClusters":false,"showDevTierBetaVersion":false,"enableDebugUI":false,"showHiddenSparkVersions":false,"allowNonAdminUsers":true,"userId":100043,"dbcSupportURL":"http://help.databricks.com","staticNotebookResourceUrl":"https://databricks-prod-cloudfront.cloud.databricks.com/static/201604171002070700-c4c797e204f33fa939fad994617c60c2ea182ad5/","enableSparkPackages":true,"enableHybridClusterType":true,"enableNotebookHistoryUI":true,"availableWorkspaces":[{"name":"Workspace 0","orgId":0}],"enableFolderHtmlExport":true,"enableSparkVersionsUI":true,"homepageFeaturedLinks":[{"linkURI":"https://docs.cloud.databricks.com/docs/latest/featured_notebooks/1%20QuickStart%20Notebooks%20(Python).html","displayName":"Getting 
Started","icon":"img/home/Python_icon.svg"},{"linkURI":"https://docs.cloud.databricks.com/docs/latest/featured_notebooks/3%20QuickStart%20DataFrames%20(Scala).html","displayName":"Introduction to DataFrames","icon":"img/home/Scala_icon.svg"},{"linkURI":"https://docs.cloud.databricks.com/docs/latest/featured_notebooks/DecisionTrees-Example.html","displayName":"Decision Trees in Spark","icon":"img/home/Scala_icon.svg"}],"databricksGuideStaticUrl":"","enableHybridClusters":true,"enableSessionIdleDetection":true,"upgradeURL":"","notebookLoadingBackground":"#fff","enableNewJobRunDetailsPage":true,"user":"sujit.pal@elsevier.com","enableServerAutoComplete":true,"enableStaticHtmlImport":true,"defaultMemoryPerContainerMB":28000,"enablePresenceUI":true,"tablesPublisherRootId":86388,"enableNewInputWidgetUI":true,"accounts":false,"useFramedStaticNotebooks":true,"enableNewProgressReportUI":true,"defaultCoresPerContainer":4};</script>
<script>var __DATABRICKS_NOTEBOOK_MODEL = {"version":"NotebookV1","origId":198496,"name":"wmd_short_sentences","language":"scala","commands":[{"version":"CommandV1","origId":198498,"guid":"2a5bf827-66da-4021-b487-5783294af2d0","subtype":"command","commandType":"auto","position":1.0,"command":"import org.apache.spark.storage.StorageLevel\n\nval sentencePairs = sc.textFile(\"/path/to/sentence_pairs.txt\")\n    .map(line => {\n        val Array(s1, s2, _) = line.split('\\t')\n        (s1, s2)\n    })\n    .zipWithIndex\n    .persist(StorageLevel.MEMORY_AND_DISK)\nsentencePairs.count()","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">import org.apache.spark.storage.StorageLevel\nsentencePairs: org.apache.spark.rdd.RDD[((String, String), Long)] = ZippedWithIndexRDD[305] at zipWithIndex at &lt;console&gt;:105\nres46: Long = 16\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"<div class=\"ansiout\">&lt;console&gt;:31: error: value swap is not a member of org.apache.spark.rdd.RDD[((String, String), Long)]\npossible cause: maybe a semicolon is missing before `value swap'?\n           .swap\n            ^\n</div>","error":null,"workflows":[],"startTime":1.443413387038E12,"submitTime":1.443413386764E12,"finishTime":1.443413387701E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"sujit.pal@elsevier.com","commandTitle":null,"showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"0981afd8-b407-4036-ba42-bb27751c69dc"},{"version":"CommandV1","origId":198499,"guid":"14cdd2c2-bdbf-43f9-8e4a-a7def948be26","subtype":"command","commandType":"auto","position":2.0,"command":"val 
stopwords = sc.textFile(\"/path/to/stopwords.txt\")\n    .collect\n    .toSet\nval bStopwords = sc.broadcast(stopwords)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">stopwords: scala.collection.immutable.Set[String] = Set(down, it's, ourselves, that's, for, further, she'll, any, there's, this, haven't, in, ought, myself, have, your, off, once, i'll, are, is, his, why, too, why's, am, than, isn't, didn't, himself, but, you're, below, what, would, i'd, if, you'll, own, they'll, up, we're, they'd, so, our, do, all, him, had, nor, before, it, a, she's, as, hadn't, because, has, she, yours, or, above, yourself, herself, she'd, such, they, each, can't, don't, i, until, that, out, he's, cannot, to, we've, hers, you, did, let's, most, here, these, hasn't, was, there, when's, shan't, doing, at, through, been, over, i've, on, being, same, how, whom, my, after, who, itself, me, them, by, then, couldn't, he, should, few, wasn't, again, while, their, not, with, from, you've, they've, what's, wouldn't, both, could, its, under, which, you'd, an, be, here's, into, where, he'll, her, themselves, were, more, we'd, where's, they're, who's, between, aren't, ours, about, doesn't, how's, against, during, no, very, we, having, mustn't, some, does, when, shouldn't, yourselves, he'd, other, of, weren't, and, won't, theirs, i'm, we'll, the, those, only)\nbStopwords: org.apache.spark.broadcast.Broadcast[scala.collection.immutable.Set[String]] = 
Broadcast(188)\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"workflows":[],"startTime":1.443413387705E12,"submitTime":1.443413386799E12,"finishTime":1.443413388252E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"sujit.pal@elsevier.com","commandTitle":null,"showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"42fc0459-e0b8-49ca-8429-e7b07fe170be"},{"version":"CommandV1","origId":198500,"guid":"e038583d-24b7-4846-a1e2-5b73901d0fae","subtype":"command","commandType":"auto","position":2.5,"command":"def getWordPairs(id: Long, s1: String, s2: String, stopwords: Set[String]): \n        List[(Long, (String, String))] = {\n    val w1s = s1.toLowerCase\n          .replaceAll(\"\\\\p{Punct}\", \"\")\n          .split(\" \")\n          .filter(w => !stopwords.contains(w))\n    val w2s = s2.toLowerCase\n          .replaceAll(\"\\\\p{Punct}\", \"\")\n          .split(\" \")\n          .filter(w => !stopwords.contains(w))\n    val wpairs = for (w1 <- w1s; w2 <- w2s) yield (id, (w1, w2))\n    wpairs.toList\n}\n\nval wordPairs = sentencePairs.flatMap(ssi => \n    getWordPairs(ssi._2, ssi._1._1, ssi._1._2, bStopwords.value))\nwordPairs.take(10)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">getWordPairs: (id: Long, s1: String, s2: String, stopwords: Set[String])List[(Long, (String, String))]\nwordPairs: org.apache.spark.rdd.RDD[(Long, (String, String))] = MapPartitionsRDD[308] at flatMap at &lt;console&gt;:128\nres47: Array[(Long, (String, String))] = Array((0,(like,like)), (0,(like,unmarried)), (0,(like,man)), (0,(bachelor,like)), 
(0,(bachelor,unmarried)), (0,(bachelor,man)), (1,(john,john)), (1,(john,nice)), (1,(nice,john)), (1,(nice,nice)))\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"<div class=\"ansiout\">&lt;console&gt;:39: error: type mismatch;\n found   : Array[(Long, (String, String))]\n required: List[(Long, (String, String))]\n                  wpairs\n                  ^\n</div>","error":null,"workflows":[],"startTime":1.443413388256E12,"submitTime":1.443413386831E12,"finishTime":1.443413389139E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"sujit.pal@elsevier.com","commandTitle":null,"showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"1698655b-8145-46fc-9d61-4edbcd1dc2fd"},{"version":"CommandV1","origId":198501,"guid":"710dd2fa-2c37-4030-a526-167d7204b4a6","subtype":"command","commandType":"auto","position":3.0,"command":"val w2vs = sc.textFile(\"/path/to/GoogleNews-vectors-negative300.tsv\")\n    .map(line => {\n        val Array(word, vector) = line.split('\\t')\n        (word, vector)\n    })\nw2vs.take(1)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">w2vs: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[311] at map at &lt;console&gt;:101\nres48: Array[(String, String)] = 
Array((Allanah_Munson,0.0777719,0.0396534,-0.0992615,0.00745101,0.0234083,-0.105401,-0.00153497,-0.111541,-0.022385,0.0616547,0.0762369,-0.000443703,-0.029932,-0.0067155,0.0859584,-0.0437467,0.0675388,-0.00450898,-0.0915867,-0.0110646,-0.00498866,0.0509099,-0.0202105,0.00390139,0.0327461,0.0368393,-0.0583289,0.0690737,0.0161172,0.0225129,0.00219053,-0.114611,0.0839118,-0.124844,-0.0365835,-0.0798185,-0.0184197,0.0289086,0.0246875,0.0300599,0.108471,-0.0419559,0.00780277,-0.0639572,0.0498866,-0.0422117,-0.0100413,-0.0228967,0.130473,0.10796,0.0383743,0.0585848,-0.0158614,-0.154521,-0.0834001,0.0736787,-0.00613989,-0.0330019,-0.0629338,-0.0767486,-0.0598639,0.0603756,-0.135078,0.0276295,0.00302198,-0.00684342,0.02392,-0.00409326,0.016373,0.134054,0.0450258,-0.0106169,0.0393976,-0.0782836,-0.0972149,0.0138147,0.0424676,-0.0110646,0.0818652,0.0637013,-0.034281,-0.0767486,0.0085063,-0.0285249,0.0654921,-0.00997732,-0.110006,0.105401,0.0264783,-0.0951683,0.00305395,-0.0665155,-0.118193,0.0598639,-9.14388e-05,-0.0613989,0.0578173,-0.0588406,-0.0869817,-0.00476481,-0.00908192,0.0166289,0.0488633,0.0905633,0.0424676,-0.056794,-0.071632,0.000951363,-0.00422117,0.0793069,0.103355,-0.0353044,0.0434909,0.0340252,-0.00620385,0.034281,-0.04477,-0.0213617,0.0424676,-0.0237921,-0.0936333,0.0327461,-0.0360718,0.0437467,-0.0200826,-0.0588406,-0.0248154,0.0115762,-0.0532124,0.124333,-0.0787952,-0.0386301,0.022385,0.0353044,-0.0327461,-0.0309553,-0.0417001,-0.0211059,-0.0690737,0.0660038,-0.0486074,-0.0967032,-0.00476481,-0.0273737,0.0185476,0.0427234,-0.00334176,0.00946566,-0.0859584,-0.00719518,0.141217,0.108471,-0.0616547,0.00789871,-0.0264783,-0.0360718,0.0967032,-0.0127914,0.0278853,-0.0445142,-0.00318187,0.052189,0.140194,0.0401651,-0.0808419,-0.00399732,0.0337694,-0.0470725,0.0125996,0.0363277,-0.0598639,0.00991336,0.013495,-0.074702,0.0480958,-0.0690737,0.1141,-0.0700971,-0.103355,0.0555148,-0.0156055,0.0207221,0.103866,-0.00127115,-0.0212338,-0.0793069,0.038886,0.00107928,0.052
7007,-0.0514216,0.0200826,0.0131752,0.0340252,0.0470725,-0.133031,0.0596081,-0.0409326,-0.103866,0.00441304,-0.0511657,-0.0251991,0.0235362,0.0685621,0.0232804,0.0106169,-0.110518,0.0171405,0.00933775,-0.00757892,-0.123309,-0.00249433,-0.0956799,-0.00901796,0.0798185,0.0156055,0.0511657,0.0383743,0.0496308,0.0131752,-0.11103,0.00454096,0.0946566,0.00350165,0.0644688,0.014902,-0.0424676,0.0111925,-0.00789871,0.0035816,-0.0321065,-0.0580731,0.0920983,-0.0480958,-0.0167568,-0.0383743,0.0695854,-0.0813535,-0.0241758,-0.0249433,0.106936,-0.00527647,0.00338973,0.0834001,-0.135078,-0.0141345,0.0318507,-0.0168847,0.0243037,0.0406768,-0.0305715,0.00604395,0.0113844,0.0544915,0.0619105,0.0511657,-0.0235362,0.0202105,-0.000979344,0.0161172,-0.11103,0.04477,0.0241758,-0.0675388,-0.0323623,0.0332577,0.0445142,-0.0170126,0.0956799,-0.0473283,-0.0255829,-0.038886,-0.00441304,-0.0741903,0.0601197,0.0823768,0.0483516,0.00383743,-0.0491191,0.0532124,0.0180359,-0.0155416,0.0153497,0.035816,0.0150939,-0.0404209,-0.0424676,0.0613989,0.00422117,0.118704,0.0475841,0.0519332,-0.0721437,-0.0465608,-0.00114323,-0.0131752,-0.0470725,-0.00876213,-0.04477,0.00959357,0.0411884))\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"workflows":[],"startTime":1.443413389142E12,"submitTime":1.443413386862E12,"finishTime":1.44341338967E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"sujit.pal@elsevier.com","commandTitle":null,"showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"0cb80306-cffe-468c-8394-4fe11adeaea9"},{"version":"CommandV1","origId":198502,"guid":"b054a135-2ea6-4f49-b43d-be3d672cde5c","subtype":"command","comma
ndType":"auto","position":4.0,"command":"import breeze.linalg._\n\ndef dist(lvec: String, rvec: String): Double = {\n    val lv = DenseVector(lvec.split(',').map(_.toDouble))\n    val rv = DenseVector(rvec.split(',').map(_.toDouble))\n    math.sqrt(sum((lv - rv) :* (lv - rv)))\n    //1.0D - lv.dot(rv) / (norm(lv) * norm(rv))\n}\n\nval wordVectors = wordPairs.map({case (idx, (lword, rword)) => (rword, (idx, lword))})\n    .join(w2vs)    // (rword, ((idx, lword), rvec))\n    .map({case (rword, ((idx, lword), rvec)) => (lword, (idx, rvec))})\n    .join(w2vs)    // (lword, ((idx, rvec), lvec))\n    .map({case (lword, ((idx, rvec), lvec)) => ((idx, lword), (lvec, rvec))})\n    .map({case ((idx, lword), (lvec, rvec)) => ((idx, lword), List(dist(lvec, rvec)))}) \n                                                  // (idx, lword), [dist]\n    .persist(StorageLevel.MEMORY_AND_DISK)\nwordVectors.take(10)\n","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">import breeze.linalg._\ndist: (lvec: String, rvec: String)Double\nwordVectors: org.apache.spark.rdd.RDD[((Long, String), List[Double])] = MapPartitionsRDD[321] at map at &lt;console&gt;:140\nres49: Array[((Long, String), List[Double])] = Array(((6,cider),List(1.2128028644278406)), ((6,cider),List(1.0704880832457455)), ((6,cider),List(1.054342437565756)), ((6,cider),List(1.3354776621120854)), ((2,alcoholic),List(1.1656626788192517)), ((3,alcoholic),List(1.349713033594998)), ((3,alcoholic),List(1.413874492377081)), ((5,alcoholic),List(1.413874492377081)), ((2,alcoholic),List(1.1816945013631714)), 
((3,alcoholic),List(1.2132338607616129)))\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"Cancelled","error":null,"workflows":[],"startTime":1.443413389673E12,"submitTime":1.443413386895E12,"finishTime":1.443413461609E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"sujit.pal@elsevier.com","commandTitle":null,"showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"b0ed6a7f-31d8-4612-9f1f-a1430102ba4c"},{"version":"CommandV1","origId":198503,"guid":"a6c846ea-8fc7-43c3-b8ed-efe03df31dea","subtype":"command","commandType":"auto","position":5.0,"command":"val bestWMDs = wordVectors.reduceByKey((a, b) => a ++ b)\n    .mapValues(dists => dists.sortWith(_ < _).head)  // get dist to closest word\n    .map({case ((idx, lword), wmd) => (idx, wmd)})\n    .reduceByKey((a, b) => a + b)                    // sum all wmds across sentence pair\nbestWMDs.take(10)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">bestWMDs: org.apache.spark.rdd.RDD[(Long, Double)] = ShuffledRDD[325] at reduceByKey at &lt;console&gt;:142\nres50: Array[(Long, Double)] = Array((0,1.176742725809037), (1,0.0), (2,3.386809492524872), (3,3.1161814560971166), (4,4.106139922327307), (5,3.505168296314785), (6,2.2169259719396095), (7,0.0), (8,1.3798001799052624), (9,1.04864558369858))\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"Command 
skipped","error":null,"workflows":[],"startTime":1.443413461613E12,"submitTime":1.443413386913E12,"finishTime":1.443413471967E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"sujit.pal@elsevier.com","commandTitle":null,"showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"006f59c9-e1da-4be0-9ea9-5e3ca17036b3"},{"version":"CommandV1","origId":198504,"guid":"db1b3912-9ed5-48b7-813f-fd285e7a31e8","subtype":"command","commandType":"auto","position":6.0,"command":"val sqlContext = new org.apache.spark.sql.SQLContext(sc)\nimport sqlContext.implicits._\n\ncase class SentencePair(s1: String, s2: String, wmd: Double)\nval results = sentencePairs.map(_.swap)\n    .join(bestWMDs)\n    .map({case (id, ((s1, s2), wmd)) => SentencePair(s1, s2, wmd)})\nval resultsDF = sqlContext.createDataFrame(results)\n    .orderBy($\"s1\".asc, $\"wmd\".asc)\ndisplay(resultsDF)","commandVersion":0,"state":"finished","results":{"type":"table","data":[["A glass of cider.","A full cup of apple juice.",2.2169259719396095],["Canis familiaris are animals.","Dogs are common pets.",1.859694788966317],["Dogs are animals.","They are common pets.",1.4537090848972198],["I have a hammer.","Take some nails.",1.1578027104196844],["I have a hammer.","Take some apples.",1.3028564676146912],["I have a pen.","Where is ink?",1.020277185488236],["I have a pen.","Where do you live?",1.3924941078355293],["I like that bachelor.","I like that unmarried man.",1.176742725809037],["It is a dog.","That must be your dog.",0.0],["It is a dog.","It is a pig.",1.04864558369858],["It is a dog.","It is a log.",1.3798001799052624],["John is very nice.","Is John very nice?",0.0],["Red alcoholic 
drink.","Fresh orange juice.",3.1161814560971166],["Red alcoholic drink.","A bottle of wine.",3.386809492524872],["Red alcoholic drink.","Fresh apple juice.",3.505168296314785],["Red alcoholic drink.","An English dictionary.",4.106139922327307]],"arguments":{},"addedWidgets":{},"removedWidgets":[],"schema":[{"name":"s1","type":"string"},{"name":"s2","type":"string"},{"name":"wmd","type":"double"}],"overflow":false,"aggData":[],"aggSchema":[],"aggOverflow":false,"aggSeriesLimitReached":false,"aggError":"","aggType":"","plotOptions":null,"isJsonSchema":false,"dbfsResultPath":null},"errorSummary":"Command skipped","error":null,"workflows":[],"startTime":1.44341347197E12,"submitTime":1.443413386931E12,"finishTime":1.443413473371E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"sujit.pal@elsevier.com","commandTitle":null,"showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"6c994b0d-c74e-4dfe-b8ba-d8ede797aeda"}],"dashboards":[],"guid":"289c401a-fa70-42cf-a2ee-d6e52c887490","globalVars":{},"iPythonMetadata":null,"inputWidgets":{}};</script>
<!-- Loads notebook-main.js from the versioned Databricks static path. If the
     request fails, the onerror handler sets window.mainJsLoadError, which the
     inline script at the top of <body> checks in order to show a
     "Network Error" notice. The tag has no async/defer attribute, so it is
     fetched and run before the parser reaches <body>.
     NOTE(review): presumably this bundle reads window.settings and
     __DATABRICKS_NOTEBOOK_MODEL defined by the preceding inline scripts;
     confirm against the bundle. -->
<script
 src="https://databricks-prod-cloudfront.cloud.databricks.com/static/201604171002070700-c4c797e204f33fa939fad994617c60c2ea182ad5/js/notebook-main.js"
 onerror="window.mainJsLoadError = true;"></script>
</head>
<body>
  <script>
// Fallback UI for a failed load of notebook-main.js.
//
// The <script> tag in <head> that loads notebook-main.js sets
// window.mainJsLoadError from its onerror handler. When that flag is set,
// this script appends a styled "Network Error" notice to <body>; otherwise
// it does nothing.
//
// Everything is wrapped in an IIFE so no helper variables leak onto window,
// and the notice is built from DOM nodes with textContent rather than by
// concatenating an HTML string.
(function () {
  if (!window.mainJsLoadError) {
    return;
  }

  // Must stay identical to the src of the notebook-main.js <script> in <head>.
  var resourceUrl = 'https://databricks-prod-cloudfront.cloud.databricks.com/static/201604171002070700-c4c797e204f33fa939fad994617c60c2ea182ad5/js/notebook-main.js';

  // Creates <tagName>, optionally sets its text, appends it to parent,
  // and returns the new element.
  function appendElement(parent, tagName, text) {
    var el = document.createElement(tagName);
    if (text !== undefined) {
      el.textContent = text;
    }
    parent.appendChild(el);
    return el;
  }

  var notice = document.createElement('div');
  // Expose the notice to assistive technology as an alert.
  notice.setAttribute('role', 'alert');

  appendElement(notice, 'h1', 'Network Error');
  var advice = appendElement(notice, 'p');
  appendElement(advice, 'strong',
    'Please check your network connection and try again.');
  appendElement(notice, 'p',
    'Could not load a required resource: ' + resourceUrl);

  notice.style.margin = '30px';
  notice.style.padding = '20px 50px';
  notice.style.backgroundColor = '#f5f5f5';
  notice.style.borderRadius = '5px';

  // This script runs inside <body>, so document.body already exists.
  document.body.appendChild(notice);
}());
</script>
</body>
</html>