Skip to content

Instantly share code, notes, and snippets.

@bhtucker
Created June 23, 2016 21:18
Show Gist options
  • Save bhtucker/927fc24e3f48c0224b90b2a72fb30b72 to your computer and use it in GitHub Desktop.
Save bhtucker/927fc24e3f48c0224b90b2a72fb30b72 to your computer and use it in GitHub Desktop.
PySpark Notebook for Spark Implementation of https://gist.github.com/bhtucker/5dccc1fa96d8030a752cb9f76cbaf558
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<pyspark.context.SparkContext at 0x10dc61690>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sc"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ok\n"
]
}
],
"source": [
"print(\"ok\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"keyed_rdd = sc.parallelize([\n",
" ('a', 2),\n",
" ('a', 2),\n",
" ('a', 2),\n",
" ('b', 2), \n",
" ('b', 2)\n",
" ])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('a', [2, 2, 2]), ('b', [2, 2])]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"keyed_rdd.reduceByKey(lambda a, b: a + b).collect()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"keyed_rdd = sc.parallelize([\n",
" ('a', [2]),\n",
" ('a', [2]),\n",
" ('a', [2]),\n",
" ('b', [2]), \n",
" ('b', [2])\n",
" ])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"10"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"reduce(lambda a, b: a + b, range(5))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"10"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum(range(5))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"('a', [2], 'a', [2], 'a', [2], 'b', [2], 'b', [2])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"keyed_rdd.reduce(lambda a, b: a + b)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"phrase_rdd = sc.parallelize([\n",
" 'one whole sentence',\n",
" 'another sentence entirely'\n",
" ])\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"phrase_rdd.count()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[['one', 'whole', 'sentence'], ['another', 'sentence', 'entirely']]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"phrase_rdd.map(lambda v: v.split()).count()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[3, 3]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"phrase_rdd.map(lambda v: v.split()).map(len).collect()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['one', 'whole', 'sentence', 'another', 'sentence', 'entirely']"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"phrase_rdd.flatMap(lambda v: v.split()).collect()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"6"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"phrase_rdd.flatMap(lambda v: v.split()).count()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('whole', 1), ('entirely', 1), ('another', 1), ('one', 1), ('sentence', 2)]"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_rdd = phrase_rdd.flatMap(lambda v: v.split())\n",
"\n",
"word_keyed_rdd = word_rdd.map(lambda v: (v, 1))\n",
"\n",
"word_keyed_rdd.reduceByKey(lambda a, b: a + b).collect()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(int,\n",
" {'another': 1, 'entirely': 1, 'one': 1, 'sentence': 2, 'whole': 1})"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_rdd.countByValue()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"puzzle_rdd = sc.parallelize(\n",
" [(0, 0)]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import operator\n",
"\n",
"def map_to_children(row, delta_values=[3, 5], delta_ops=[operator.add, operator.sub], current_level=None):\n",
" value, level = row\n",
" if current_level is not None and level != current_level:\n",
" return [row]\n",
" return [\n",
" (op(value, delta), level + 1)\n",
" for op in delta_ops\n",
" for delta in delta_values\n",
" if 0 <= op(value, delta) <= 8\n",
" ]\n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 3), (1, 5)]"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"map_to_children((0, 0))"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0), (3, 1), (5, 1)]\n",
"[(0, 0), (2, 2), (3, 1), (5, 1), (6, 2), (8, 2)]\n",
"[(0, 0), (1, 3), (2, 2), (3, 1), (5, 1), (6, 2), (7, 3), (8, 2)]\n",
"[(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n",
"[(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n",
"[(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n"
]
}
],
"source": [
"from functools import partial\n",
"\n",
"puzzle_rdd = sc.parallelize(\n",
" [(0, 0)]\n",
")\n",
"\n",
"prior_count = -10\n",
"next_count = -5\n",
"level = 0\n",
"\n",
"while prior_count != next_count:\n",
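" # expand only the rows at the current frontier level (other rows pass through unchanged); duplicate values are merged below by reduceByKey, keeping the lowest level\n",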
" prior_level = puzzle_rdd\n",
" prior_count = prior_level.count()\n",
" next_level = puzzle_rdd.flatMap(lambda v: map_to_children(v, current_level=level)).cache()\n",
" next_count = next_level.count()\n",
" puzzle_rdd = prior_level.union(next_level)\n",
" puzzle_rdd = puzzle_rdd.reduceByKey(lambda a, b: min(a, b))\n",
" level += 1\n",
" print(puzzle_rdd.collect())\n",
"\n",
"# .flatMap(map_to_children).flatMap(map_to_children).flatMap(map_to_children).collect()"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(1, 1)"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"prior_count, next_count"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[[(1, 3), (1, 5)]]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
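"# Row reduction from generating children only for the 'frontier' level.\n",
"# Two listings follow; in each, a number precedes the collected rows for one iteration.\n",
"# The first listing's numbers are smaller (3, 10, 15, 18, 19 vs. 3, 11, 20, 26, 29):\n",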
"\n",
"# 3\n",
"# [(0, 0), (3, 1), (5, 1)]\n",
"# 10\n",
"# [(0, 0), (2, 2), (3, 1), (5, 1), (6, 2), (8, 2)]\n",
"# 15\n",
"# [(0, 0), (1, 3), (2, 2), (3, 1), (5, 1), (6, 2), (7, 3), (8, 2)]\n",
"# 18\n",
"# [(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n",
"# 19\n",
"# [(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n",
"\n",
"\n",
"# 3\n",
"# [(0, 0), (3, 1), (5, 1)]\n",
"# 11\n",
"# [(0, 0), (2, 2), (3, 1), (5, 1), (6, 2), (8, 2)]\n",
"# 20\n",
"# [(0, 0), (1, 3), (2, 2), (3, 1), (5, 1), (6, 2), (7, 3), (8, 2)]\n",
"# 26\n",
"# [(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n",
"# 29\n",
"# [(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from functools import partial\n",
"def addx(a, x=4):\n",
" return a + x\n",
"\n",
"pseudopartial = lambda v: addx(v, x=8)\n",
"aa = pseudopartial(2)\n",
"realpartial = partial(addx, x=8)\n",
"bb = realpartial(2)\n",
"assert aa == bb"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment