Skip to content

Instantly share code, notes, and snippets.

@ntfrnzn
Created December 8, 2020 23:53
Show Gist options
  • Save ntfrnzn/7c5befcde7d659988c03db9cc365f16c to your computer and use it in GitHub Desktop.
Save ntfrnzn/7c5befcde7d659988c03db9cc365f16c to your computer and use it in GitHub Desktop.
spark notebook example
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"\n",
"def inside(p):\n",
"    \"\"\"Report whether a freshly drawn random point falls inside the unit circle.\n",
"\n",
"    The point is sampled uniformly from the unit square [0, 1) x [0, 1). The\n",
"    argument `p` is never read; it only lets the function be used as a\n",
"    one-argument predicate (e.g. with `filter`).\n",
"    \"\"\"\n",
"    x = random.random()\n",
"    y = random.random()\n",
"    within_circle = x * x + y * y < 1\n",
"    return within_circle\n",
"\n",
"NUM_SAMPLES = 1_000_000\n",
"\n",
"# Each RDD element triggers one random draw; the fraction that passes the\n",
"# filter, multiplied by 4, is the estimate printed below.\n",
"count = (\n",
"    sc.parallelize(range(NUM_SAMPLES))\n",
"    .filter(inside)\n",
"    .count()\n",
")\n",
"\n",
"print(\"Pi is roughly %f\" % (4.0 * count / NUM_SAMPLES))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.mllib.random import RandomRDDs\n",
"\n",
"# Number of samples to draw (one million).\n",
"NUM_SAMPLES = 1_000_000\n",
"# Number of partitions the samples are spread across.\n",
"NUM_PARTITIONS = 10\n",
"# Fixed seed so that re-running the notebook on a fresh kernel draws the same samples.\n",
"SEED = 42\n",
"\n",
"# Generates an RDD of i.i.d. samples from the standard normal distribution N(0, 1),\n",
"# evenly distributed across NUM_PARTITIONS partitions.\n",
"normal_01 = RandomRDDs.normalRDD(sc, NUM_SAMPLES, numPartitions=NUM_PARTITIONS, seed=SEED)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Rescale the standard-normal samples to a normal distribution with a\n",
"# different mean and standard deviation.\n",
"mean = 20\n",
"stdev = 10\n",
"normal_2010 = normal_01.map(lambda z: z * stdev + mean)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# count() is an action: it forces evaluation of the lazy transformations built up so far.\n",
"normal_2010.count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Add up every sample in the RDD.\n",
"normal_2010.sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# collect() runs all pending transformations and brings every element back to the driver.\n",
"# DANGER: the entire dataset has to fit in the driver's memory.\n",
"local_normal_2010 = normal_2010.collect()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Plot a kernel density estimate of the collected samples.\n",
"s = pd.Series(local_normal_2010)\n",
"ax = s.plot.kde()\n",
"# Label the figure so it can be understood on its own; the trailing ';' hides the repr.\n",
"ax.set(title=\"Kernel density estimate of the transformed samples\", xlabel=\"Sample value\");"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# List every (key, value) setting in the active SparkContext's configuration.\n",
"sc.getConf().getAll()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Spark - Python (Kubernetes Mode)",
"language": "python",
"name": "spark_python_kubernetes"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment