Skip to content

Instantly share code, notes, and snippets.

@s-j
Created October 1, 2018 12:12
Show Gist options
  • Save s-j/eabae3d46986f1b587fd6de1e3427d78 to your computer and use it in GitHub Desktop.
Save s-j/eabae3d46986f1b587fd6de1e3427d78 to your computer and use it in GitHub Desktop.
Test Jupyter notebook gist
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Matrix factorization diagram](https://databricks-training.s3.amazonaws.com/img/matrix_factorization.png)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from pyspark.ml.feature import StringIndexer\n",
"from pyspark.ml.recommendation import ALS\n",
"from pyspark.ml.evaluation import RegressionEvaluator"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Load the raw ';'-separated event log through the SparkSession entry point\n",
"# (`spark`, the same handle the SQL cell later in this notebook uses) rather\n",
"# than the legacy `sqlContext`, so the notebook relies on a single entry point.\n",
"# The file is read without header=True, so Spark auto-names the columns\n",
"# _c0, _c1, _c2; rename them to the names the rest of the notebook expects.\n",
"events = (spark.read.csv('hdfs://hdfs-mesos/data.csv', sep=';', inferSchema=True)\n",
"          .withColumnRenamed('_c0', 'time')\n",
"          .withColumnRenamed('_c1', 'item')\n",
"          .withColumnRenamed('_c2', 'user'))\n",
"events.take(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Collapse the event log to one row per (user, item) pair with its number of\n",
"# events in a `count` column, and cache the result for the cells that follow.\n",
"user_items = (events\n",
"              .groupBy('user', 'item')\n",
"              .count()\n",
"              .cache())\n",
"user_items.take(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# ALS needs numeric ids, so map every distinct `user` value to a numeric index.\n",
"user_indexer = StringIndexer(inputCol=\"user\", outputCol=\"userIdx\")\n",
"# Drop any `userIdx` left over from an earlier run of this cell (a no-op on the\n",
"# first run). Without this the cell cannot be re-executed, because StringIndexer\n",
"# refuses to write to an output column that already exists.\n",
"users_to_index = user_items.drop(\"userIdx\")\n",
"user_items = user_indexer.fit(users_to_index).transform(users_to_index)\n",
"user_items.take(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# ALS needs numeric ids, so map every distinct `item` value to a numeric index.\n",
"item_indexer = StringIndexer(inputCol=\"item\", outputCol=\"itemIdx\")\n",
"# Drop any `itemIdx` left over from an earlier run of this cell (a no-op on the\n",
"# first run). Without this the cell cannot be re-executed, because StringIndexer\n",
"# refuses to write to an output column that already exists.\n",
"items_to_index = user_items.drop(\"itemIdx\")\n",
"user_items = item_indexer.fit(items_to_index).transform(items_to_index)\n",
"user_items.take(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hold out roughly 20% of the (user, item, count) rows for evaluation; the\n",
"# weights are sampling proportions, so the split sizes are approximate.\n",
"# A fixed seed keeps the split, and every metric computed from it, repeatable\n",
"# across Restart & Run All (given the same input data and partitioning).\n",
"(training, test) = user_items.randomSplit([0.8, 0.2], seed=42)\n",
"training.take(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"# Fit ALS in implicit-feedback mode on the indexed ids, using the per-pair\n",
"# event count as the rating column.\n",
"als = ALS(\n",
"    maxIter=5,\n",
"    regParam=0.01,\n",
"    userCol=\"userIdx\",\n",
"    itemCol=\"itemIdx\",\n",
"    ratingCol=\"count\",\n",
"    implicitPrefs=True\n",
")\n",
"model = als.fit(training)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"predictions = model.transform(test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Peek at a few scored rows: original ids, observed count, model prediction.\n",
"(predictions\n",
" .select('user', 'item', 'count', 'prediction')\n",
" .take(10))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"# Root-mean-square error of `prediction` against the observed `count`.\n",
"# NOTE(review): the model was fitted with implicitPrefs=True, so its output is\n",
"# presumably a preference score rather than a count - confirm that RMSE against\n",
"# the raw `count` is the intended metric.\n",
"evaluator = RegressionEvaluator(\n",
"    metricName=\"rmse\",\n",
"    labelCol=\"count\",\n",
"    predictionCol=\"prediction\"\n",
")\n",
"rmse = evaluator.evaluate(predictions)\n",
"print(\"Root-mean-square error = \" + str(rmse))"
]
},
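{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"```¯\\_(ツ)_/¯```\n",
"\n",
"Presumably the RMSE above is `NaN`: by default, Spark's ALS model returns `NaN` predictions for users or items that did not appear in the training split, and a single `NaN` prediction makes the whole metric `NaN`. The next cells keep only the test rows whose user and item are both known to the model and evaluate again."
]
},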
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Expose the held-out split, plus the distinct user and item indices that occur\n",
"# in the training split, as temporary views so they can be joined in SQL.\n",
"test.createOrReplaceTempView(\"test\")\n",
"(training\n",
" .select('userIdx')\n",
" .distinct()\n",
" .createOrReplaceTempView(\"model_users\"))\n",
"(training\n",
" .select('itemIdx')\n",
" .distinct()\n",
" .createOrReplaceTempView(\"model_items\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Keep only the test rows whose user AND item both occur in the training split\n",
"# (inner joins against the distinct-index views registered above).\n",
"# The adjacent string literals concatenate into a single SQL statement.\n",
"test2 = spark.sql(\n",
"    \"SELECT user, item, count, test.userIdx as userIdx, test.itemIdx as itemIdx \"\n",
"    \" FROM test \"\n",
"    \" JOIN model_users ON test.userIdx = model_users.userIdx \"\n",
"    \" JOIN model_items ON test.itemIdx = model_items.itemIdx\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"test2.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"predictions2 = model.transform(test2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"rmse = evaluator.evaluate(predictions2)\n",
"print(\"Root-mean-square error = \" + str(rmse))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```(•̀ᴗ•́)و ̑̑```"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda root]",
"language": "python",
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment