Skip to content

Instantly share code, notes, and snippets.

@nicomak
Created November 13, 2015 03:08
Show Gist options
  • Save nicomak/3ce42dee154dc2ed7514 to your computer and use it in GitHub Desktop.
Save nicomak/3ce42dee154dc2ed7514 to your computer and use it in GitHub Desktop.
IPython Notebook with Word Count and SparkSQL Example
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Execution took 0:21:45.194165\n"
]
}
],
"source": [
"import datetime\n",
"import re\n",
"from pyspark.sql.types import *\n",
"\n",
"start = datetime.datetime.now()\n",
"\n",
"def get_words(line):\n",
" words = re.sub(r'[^a-zA-Z]+', ' ', line).lower().split(' ')\n",
" return [word for word in words if len(word) < 50]\n",
"\n",
"# Distributed wordcount map/reduce\n",
"text_file = sc.textFile(\"hdfs://ubuntu0:9000/user/hduser/medium/input\")\n",
"counts = text_file.flatMap(lambda line: get_words(line)).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b).cache()\n",
" \n",
"# Transform result RDD into Spark DataFrame\n",
"fields = [StructField('word', StringType(), True), StructField('count', IntegerType(), True)]\n",
"structure = StructType(fields)\n",
"schemaWordCounts = sqlContext.createDataFrame(counts, structure)\n",
"schemaWordCounts.registerTempTable(\"wordcounts\")\n",
"\n",
"print 'Execution took %s' % (datetime.datetime.now() - start)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Query took 0:00:05.771639\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2YAAADgCAYAAABo1HWmAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XmYJWV1+PHvGZawLwk6KswwCkhEMCJGUERbTcyIOiZK\ncAggwbgmgDFqVGLCqPziglvQRyRGYCQuiCYKinHDVhREZUfRBxCcQZEY2YQBZDm/P6ru9J1Ld99m\nqLeq+/b38zz9TN/q6jp1e+6tW6fe856KzESSJEmS1J0FXe+AJEmSJM13JmaSJEmS1DETM0mSJEnq\nmImZJEmSJHXMxEySJEmSOmZiJkmSJEkday0xi4iTIuKGiLhsBuu+LyIuqr9+GhE3tbGPkiRJktSF\naOs+ZhGxH3Ab8PHM3OMB/N4RwOMz82XFdk6SJEmSOtTaiFlmngOsM/IVETtFxJcj4ocR8e2I2HWS\nX/0r4FOt7KQkSZIkdWDDjuP/O/DKzLwqIvYGPgw8q/fDiNgRWAKc3c3uSZIkSVJ5nSVmEbEF8GTg\n9IjoLd54YLXlwOnZVr2lJEmSJHWgyxGzBcDNmbnnNOu8GPjblvZHkiRJkjox7RyziNgkIs6PiIsj\n4vKIWDHFesdHxJURcUlETJdorZWZtwLXRMQB9TYiIh7Xt80/BLbNzO/N+NlIkiRJ0hw0bWKWmXcC\nz8jMxwOPB5bWc8HWioj9gZ0zcxfgFcAJk20rIj4FnAvsGhGrI+Jw4GDgbyLiYuByYFnfr7wYm35I\nkiRJmgeGljJm5pr6242BjYD7BlZZBqys1z0/IraJiIWZecPAdg6aIsRzpoj71mH7JkmSJEmjYGi7\n/IhYUI9o3QB8NTN/MLDK9sDqvsfXATs0t4uSJEmSNNpmMmJ2H/D4iNga+O+IeGxm/mhgtRj8tcHt\nRISdFSVJkiTNa5k5mDsBD+AG05l5C/BNYOnAj34BLOp7vEO9bLJtPOCvY445Zr1+b32/RjneKD83\n4xnPeN3FG+XnZjzjGa+7eKP83Iw3f+NNZ1hXxu0iYpv6+02BPwWuGFjtDOAl9Tr7ULXAvwFJkiRJ\n0owMK2V8OLAyIjagSuJOy8yzIuKVAJl5Yv14/4i4CrgdOLzsLkuSJEnSaJk2McvMy4AnTLL8xIHH\nRzS8X2uNjY2V2vS8izfKz814xjNed/FG+bkZz3jG6y7eKD834xlvMjGs1rGxQBHZVixJkiRJmm0i\ngpyi+cfQroySJEmS5reISXMJTeOBDkqZmEmSJEkayuq3mVufRHbG7fIlSZIkSWWYmEmSJElSx0zM\nJEmSJKljJmaSJEmS1LFZ0fzjwXR5cRKiJEmSpLluFo2Y5Xp8SZIkSepCRBT/mo2WLFnC2Wef3fh2\nZ8WImSRJkqS5qORgyexMzOqbRDe+3Vk0YiZJkiRJD8zq1at54QtfyEMf+lC22247jjzySDKTY489\nliVLlrBw4UIOO+wwbr31VgDGx8dZtGjROtvoHwVbsWIFBx54IIcddhhbbbUVu+++OxdccAEAhx56\nKKtWreL5z38+W265Je95z3saex4mZpIkSZLmpHvvvZfnPe95PPKRj+TnP/85v/zlL1m+fDknn3wy\nK1euZHx8nJ/97GfcdtttHHHEEVNuZ7Bs8swzz+Sggw7illtuYdmyZWt/99RTT2Xx4sV88Ytf5Le/\n/S2vf/3rG3suJmaSJEmS5qTvf//7XH/99Rx33HFsuummbLzxxuy777584hOf4HWvex1Llixh8803\n5x3veAef/vSnue+++2a03f3224+lS5cSERxyyCFccsklhZ+JiZkkSZKkOWr16tXsuOOOLFiwblpz\n/fXXs+OOO659vHjxYu655x5uuOGGGW134cKFa7/fbLPNuPPOO2ec1K0vEzNJkiRJc9KiRYtYtWoV\n99577zrLH/GIR3Dttdeufbxq1So23HBDFi5cyOabb86aNWvW/uzee+/l17/+9YxjluoWaWImSZIk\naU7ae++9efjDH86b3vQm1qxZw5133sl3v/tdDjroIN7//vdz7bXXctttt3H00UezfPlyFixYwKMf\n/WjuvPNOzjrrLO6++26OPfZY7rrrrhnHXLhwIVdffXXjz8XETJIkSdJ6ioJfwy1YsIAzzzyTq666\nisWLF7No0SJOP/10XvrSl3LooYfytKc9jUc96lFsttlmfPCDHwRg66235sMf/jAve9nL2GGHHdhi\niy3W6dI42T3U+h+/+c1v5thjj2Xbbbflfe973wP6a00nSvTgnzRQRE4Vq3qi67MfZe4hIEmSJGlC\nqXt3jaqp/l718kmzTkfMJEmSJKlj0yZmEbEoIr4ZET+KiMsj4qhJ1hmLiFsi4qL66y3ldleSJEmS\nRs+GQ35+N/DazLw4IrYALoiIr2XmFQPrfSszl5XZRUmSJEkabdOOmGXmrzLz4vr724ArgEdMsmqZ\nnpGSJEmSNA/MeI5ZRCwB9gTOH/hRAk+JiEsi4qyI2K253ZMkSZKk0TeslBGAuozxs8Br6pGzfhcC\nizJzTUQ8B/g88OjJtrNixYq134+NjTE2NrYeuyxJkiRJs9/4+Djj4+MzWndou/yI2Aj4IvDlzPzA\n0A1GXAPslZk3Diy3Xb4kSZI0Bw3e10vDPdB2+dOOmEX1P/Ax4MdTJWURsRD438zMiHgSVbJ342Tr\nSpIkSZp7HAwpb1gp477AIcClEXFRvexoYDFAZp4IHAC8OiLuAdYAywvtqyRJkiSNpKGljI0FspRR\nkiRJ0jw2XSnjjLsySpIkSZLKMDGTJEmSpI6ZmEmSJElSx0zMJEmSJKljJmaSJEmS1DETM0mSJEnq\nmImZJEmSJHXMxEySJEmSOmZiJkmSJEkdMzGTJEmSpI6ZmEmSJElSx0zMJEmSJKljJmaSJEmS1DET\nM0mSJEnqmImZJEmSJHXMxEySJEmSOmZiJkmSJEkdMzGTJEmSpI6ZmEmSJElSx6ZNzCJiUUR8MyJ+\nFBGXR8RRU6x3fERcGRGXRMSeZXZVkiRJkkbThkN+fjfw2sy8OCK2AC6IiK9l5hW9FSJif2DnzNwl\nIvYGTgD2KbfLkiRJkjRaph0xy8xfZebF9fe3AVcAjxhYbRmwsl7nfGCbiFhYYF8lSZIkaSTNeI5Z\nRCwB9gTOH/jR9sDqvsfXATs82B2TJEmSpPliWCkjAHUZ42eB19QjZ/dbZeBxTradFStWrP1+bGyM\nsbGxGe2kJEmSJM014+PjjI+Pz2jdyJw0h5pYIWIj4IvAlzPzA5P8/CPAeGZ+un78E+DpmXnDwHo5\nVayIYIpcbtjuM2z/JUmSJGk2iAgyc3BQCxjelTGAjwE/niwpq50BvKRefx/g5sGkTJIkSZI0tWlH\nzCLiqcC3gUuZGNI6GlgMkJkn1ut9CFgK3A4cnpkXTrItR8wkSZIkzVvTjZgNLWVscCdMzCRJkiTN\nW+tdyihJkiRJKs/ETJIkSZI6ZmImSZIkSR0zMZMkSZKkjpmYSZIkSVLHTMwkSZIkqWMmZpIkSZLU\nMRMzSZIkSeqYiZkkSZIkdczETJIkSZI6ZmImSZIkSR0zMZMkSZKkjpmYSZIkSVLHTMwkSZIkqWMm\nZpIkSZLUMRMzSZIkSeqYiZkkSZIkdczETJIkSZI6ZmImSZIkSR0bmphFxEkRcUNEXDbFz8ci4paI\nuKj+ekvzuylJkiRJo2vDGaxzMvBB4OPTrPOtzFzWzC5JkiRJ0vwydMQsM88BbhqyWjSzO5IkSZI0\n/zQxxyyBp0TEJRFxVkTs1sA2JUmSJGnemEkp4zAXAosyc01EPAf4PPDoyVZcsWLF2u/HxsYYGxtr\nILwkSZIkzT7j4+OMj4/PaN3IzOErRSwBzszMPWaw7jXAXpl548DynCpWRFANvD1QwUz2X5IkSZK6\nFhFk5qTTwB50KWNELIwqsyIinkSV7N045NckSZIkSbWhpYwR8Sng6cB2EbEaOAbYCCAzTwQOAF4d\nEfcAa4Dl5XZXkiRJkkbPjEoZGwlkKaMkSZKkeaxoKaMkSZIk6cFpoivjnFNPiVsvjtBJkiRJatq8\nTMwq61c6KUmSJElNs5RRkiRJkjpmYiZJkiRJHTMxkyRJkqSOmZhJkiRJUsdMzCRJkiSpYyZmkiRJ\nktQxEzNJkiRJ6piJmSRJkiR1zMRMkiRJkjpmYiZJkiRJHTMxkyRJkqSOmZhJkiRJUsdMzCRJkiSp\nYyZmkiRJktQxEzNJkiRJ6piJmSRJkiR1bGhiFhEnRcQNEXHZNOscHxFXRsQlEbFns7soSZIkSaNt\nJiNmJwNLp/phROwP7JyZuwCvAE5oaN8kSZIkaV4Ymphl5jnATdOssgxYWa97PrBNRCxsZvckSZIk\nafQ1Mcdse2B13+PrgB0a2K4kSZIkzQsbNrSdGHick620YsWKtd+PjY0xNjbWUHhJkiRJml3Gx8cZ\nHx+f0bqROWkOte5KEUuAMzNzj0l+9hFgPDM/XT/+CfD0zLxhYL2cKlZEMEUuN2zPmMn+dx1PkiRJ\nkiKCzBwc1AKaKWU8A3hJHWgf4ObBpEySJEmSNLWhpYwR8Sng6cB2EbEaOAbYCCAzT8zMsyJi/4i4\nCrgdOLzkDkuSJEnSqJlRKWMjgSxllCRJkjSPlS5llCRJkiQ9CCZmkiRJktQxEzNJkiRJ6piJmSRJ\nkiR1zMRMkiRJkjpmYiZJkiRJHTMxkyRJkqSOmZhJkiRJUsdMzCRJkiSpYyZmkiRJktQxEzNJkiRJ\n6piJmSRJkiR1zMRMkiRJkjpmYiZJkiRJHTMxkyRJkqSOmZhJkiRJUsdMzCRJkiSpYyZmkiRJktQx\nEzNJkiRJ6tjQxCwilkbETyLiyoh44yQ/H4uIWyLiovrrLWV2VZIkSZJG04bT/TAiNgA+BPwJ8Avg\nBxFxRmZeMbDqtzJzWaF9lCRJkqSRNmzE7EnAVZl5bWbeDXwaeMEk60XjeyZJkiRJ88SwxGx7YHXf\n4+vqZf0SeEpEXBIRZ0XEbk3uoCRJkiSNumlLGamSrmEuBBZl5pqIeA7weeDRk624YsWKtd+PjY0x\nNjY2s72c4yLWf0Axcyb/BZIkSZJmm/HxccbHx2e0bkx34h8R+wArMnNp/fjNwH2Z+a5pfucaYK/M\nvHFgeU4Vq0pc1icBifVKXEY9niRJkqTZJyLIzElHbYaVMv4Q2CUilkTExsCLgTMGNr4w6iGhiHgS\nVbJ34/03JUmSJEmazLSljJl5T0QcAXwF2AD4WGZeERGvrH9+InAA8OqIuAdYAywvvM+SJEmSNFKm\nLWVsNJCljC3He+Asm5QkSZLKma6UcVjzD81ZDzTJ8o4HkiRJUleGzTGTJEmSJBVmYiZJkiRJHbOU\nUQ9K2/do855wkiRJGkUmZmrA+jU2mQvxTAQlSZLUBhMzaai2E09JkiTNN84xkyRJkqSOOWImzSKW\nTkqSJM1PJmbSrGPppCRJ0nxjYibNY3bVlCRJmh1MzKR5z66akzERlCRJbTIxkzTiRjcRnCvxTHIl\nSRrOxEySGjW6I5DrF29uJLmSJHXNxEySNEs52mk8SZo/TMwkSQJm/+ij8ab8LRNPSSPAxEySJI0A\nE8/JzIXE0/mrUsXETJIkadYb3cRz/eLNjSRXeiBMzCRJkjSPONppvNkZz8RMkiRJKma2jz4ab7bE\nWzB0sxFLI+InEXFlRLxxinWOr39+SUTsuV57MqXxZjc3r+O1Gct4xjPe/InXZizjGc948ydem7GM\nZ7zu402bmEXEBsCHgKXAbsBBEfGYgXX2B3bOzF2AVwAnNLuL481ubl7HazOW8YxnvPkTr81YxjOe\n8eZPvDZjGc943ccbNmL2JOCqzLw2M+8GPg28YGCdZcBKgMw8H9gmIhY2vqeSJEmSNKKGJWbbA6v7\nHl9XLxu2zg4PftckSZIkaX6I6TqGRMSLgKWZ+fL68SHA3pl5ZN86ZwLvzMzv1o+/DvxjZl44sC17\njEqSJEma1zJz0u4gw7oy/gJY1Pd4EdWI2HTr7FAvm9EOSJIkSdJ8N6yU8YfALhGxJCI2Bl4MnDGw\nzhnASwAiYh/g5sy8ofE9lSRJkqQRNe2IWWbeExFHAF8BNgA+lplXRMQr65+fmJlnRcT+EXEVcDtw\nePG9liRJkqQRMu0cM0mSJElSeUNvMK1mRcQmM1kmtW3UX5sR8dRJlu3bxb7MZRGxQUS8p+v90NwS\nEZtFxK4tx9wqIn6/99VmbGm+qj8jHhERi3tfXe/TXDLrRswi4mHA/wO2z8ylEbEb8OTM/FjDcW4D\npnrymZlbNRmvL+6FmfmEYcsainUc8HbgDuB/gD8CXpuZpzYdq4t4dcx9gSVMlOVmZn68QJwNgZWZ\neXDT254m5mEDixKgxPOr47Xy2uzwvXdRZu45bFmD8f4eOBm4FfgP4AnAmzLzKyXi1THbej98j+q4\n3NoHSET8Efd/bv9VKNauwIeBh2XmYyPiccCyzDy2ULwLgJOAT2bmTSVi1HG6eu8tA44Dfi8zl0TE\nnsBbM3NZoXivBN4K3AXcVy/OzHxUoXgLgIOBR2bm2+oT0Ydl5vcLxdsc+AdgcWa+PCJ2AXbNzC8W\nitfK+6H+Ox6QmZ9pcrtDYrZyztkXbxFwPNC7UPht4DWZOdhYr8mY2wKLqaYkATDYOb3BWEcCxwD/\nC9zbF2+PQvFeBLwTWAj0mgw2fiyrO9D3ZF+sXrzGjmXDujJ24RSqk5l/qh9fCXwGaPRNkplbAETE\nscAvgf+sf3Qw8IgmY9VxHl5vd7OIeALVf2oCWwGbNR2v9uzMfENE/AVwLfBC4BygVKLUaryI+E/g\nUcDF9B0AgMZPROv5ljtGxO9l5l1Nb38Kf8zESdQmwLOAC2n4+bX92uzgvfdk4CnAQyLiH5g4oG5J\n2aqBl2bmByLiz4DfBw6lei8USczafD/UMb4QEacDa+plJROlk4E9gB8xcaINUCQe8FHgDcBH6seX\nAZ8CiiRmwHKq+dk/iIgfUn0GfrXpxLf33uvACmBv4Jv1flwUEUWSpNobgN0z8/8Kxuj3YarX5TOB\ntwG31cueWCjeycAFVMc1qI6jnwWKJGa09H7IzPsi4o1U53xtOYUWzjn7nAx8AjiwfnxwvexPSwSL\niLcDfw38jHWPnc8oEQ/4e6qLBL8ptP1B7wael5lXFI7z3vrfvwAeRnXeEsBBQKMND2djYrZdZp4W\nEW8CyMy7I+KegvGWZebj+h6fEBGXAv/ccJxnU705tmfiPxjgt8DRDcfq6f3/Pg/4bGbeUvh+cm3H\n2wvYrcWr9tcA34mIM1j3ZPR9JYJl5hH9jyNiG+C0AqG6eG1Ce++9jamSsA3qf3tuBQ5oOFa/XgL4\nXODUzLw8ouhdQ9p8P2wC3Eh1ItqvVKK0N/DYFt/rm2Xm+b3/r8zMiLi7VLDMvBI4OiLeQnX8PAm4\nLyJOAv4tM29sIs6wcr6m4kzi7sy8eeD1f99UKzfgaqrKjbbsnZl7RsRFUP0dI2KjgvF2yswDI2J5\nHe/2wseWNt8PX4uI11N91t3eW1jwtdn2OedDMvPkvsenRMRrC8Z7MdXr5XcFY/RbRfXZ2pZftZCU\nkZnjABHx3szcq+9HZ9QVD42ZjYnZbRHxB70HdQv+WwrGu72+cfan6sfLqa52NSozVwIrI+KAzPxs\n09ufwpkR8RPgTuDVEfHQ+vtRiXc58HCqq4VtuLr+WgBswcTIUlvWAI9seqMdvTahvffet4BvRcTJ\nmfnzprc/jQsi4qtUo1hvjoitKHsy2tr7ITP/unSMAd8DdqMaMWvDryNi596DiDgAuL5kwLpU83Dg\nOcDngE9SlTudDTy+oTAXMk0pI9VrtYQfRcTBwIZ12d1RwLmFYgG8CTg3Is6nKmeEKp84qlC830XE\n2jKxiHgIZd/rd0XEpn3xdmLieZbQ5vthOdVr8e8Gljf+2Vdr+5zzNxFxKNX7O6ieb8mR3cuBbWl4\nVGca1wDfjIgvAb1ksNgFbOCHEXEa8PmBeKUuEm4WETtl5tUA9ch/o5VFs3GO2V7AB4HHUn0IP4Sq\n5viSQvEeCfwbEyUB36Wq9722RLw65vOoTjLWNlbIzLcVivUHVPeWu7euS98yM39VIlZb8fpqfbcA\n9gS+z7ofvkXmLfTF37IO9NvCcfprmhdQvWY+k5lvLBRvG6ra8KfVi8aBt2VmkQ+ptt97EfHNSRZn\nZg6O+jQVbwHV6/PqerTgD6jmMVzacJzW3w8dzMF6OtU9M29g3ef2uKl/60HF2wn4d6rX5k1UJxsH\nF3xtXkB1MvgfwOf6y6Uj4r8z8y8KxPx9YBfW/Rz6VtNx6libU5WKPbte9BXg7ZlZ5MJdRPyAqoz+\nMqoEKaheLysLxTuEqjRtL2Al1Uj8W0rNlYqIZ1P9PXcDvgbsC/x1Zk52jGsiXqvvhzZ1cM65pI63\nT73oXODIzFxVKN4TgS9QPbfi50kRsaIXo395Zr61ULxTpohX5NZdEbGU6r1wTb1oCfCKbHDu+KxL\nzADqEoBe96afZmaxEpK2RcSJwKZUJUAfBf4SOD8z/6ZQvD2Ax9QxizSPiIhnZeY36kmYvRdU/yTM\nRq9cRMRYb9t9cXqy4MnFHlTzdXpX134NHJaZlxeKN1Z/m8A9wKrMXF0iVh3vv6hOZFZS/V0PBR6X\nmS8sFbNN9QdUzybAi4B7MvMNDcd5TFb3exxsmtI7OWx00nUX74eI+Db1nJO6hCuAyzPzsU3HquNd\nDbyW6urv2pGIgonSBvXFpS2ABZlZtDSn/wpsGyLi5VSjVjtQzRfcBziv1EWKtkXBpj6TxFoAPJmq\ntPdZ9eJvlCqvquP9JfANJk7uz8/MX5eINxB7c6r3Q+MXJac4j1irxAhIPcp5FFWi9IdUx8+ftlj2\nV1xE/JhqbmD/sbPYeVJf3M0z8/bha849UXWr/kOq1+lPsuG+A7M1MevvLFa6E91DgZdz/25fLy0U\n77LM3CMiLs3Mx9Uf/P+Tmfdr5d1ArBXA06muBH2JqkTmO5nZ6LyaiHhrZh5TX7mY7IBa6srFuzPz\nHweWvavgiNJ5wNG9q5L1CfG/ZuZTpv3FBxfzYUw0Afl+Zv5vwViXZOYfDVvWYLxNgb/h/qPHRd57\nU+zDDzLzjxve5kez6pQ2zuTvhyKTruuSiusz84768aZUI1rXTP+b6xXrh5n5xP4T4Ii4ODObKrkb\njHdeZj65xLaniLeKqrPsacDZWfiDMtrvDHc51XHlvMx8fEQ8hupY1vjIXB1vV+D13P9zttRo9b8C\nP6caZV170pSF5imVfO1PEe+CXHeeS+l4vQtZS6jm6vYuMjVW6dPheUTjnwFD4rV9ztn283sK1cj/\nlpm5qC7RfmVm/m2heG11DB28cLB28AGavXAw6+aYRbudxaAa4v02VTnA2qsJhWLBxITkNRGxPfAb\nqg4vJRxA1bL+wsw8PCIWUnUDalRmHlN/+yomDt5tvLYm62K0P1AkMaOaAL22VCQzx+sriEVExIFU\nLaZ7V7Y+FBFvyMzTC4W8IyL2y8xz6vhPZaLJSQmnAlcAS6laWx9SPy4i1m18sICqY1rj7cEz8+X1\nv2NNb3uI06mu3PfcR9VdrMSHcttzsC6KiE8CZ9LOPILHUDXhOAI4qS4XPa333ijgFNrtDHdnZt4R\nEUTEJvUIb8l7jJ0OnEB1wtb7XC/5OftX9fbfNLC81Dylr9fvgc+VTuJrbTfI+AJwM1UnyCLlp73z\niGx//up3IuJDTPwti1Q29Gn7nPOciHgH979IUer5fYDqM/0LdZxL6lL0UtrqoPs0qlHq5zP5/9fo\nJma032lv01IjLFM4M6p7ShxHdZCD6oVVwh11Oc49EbE11X0lFhWKBS0cvAEi4tXA3wI7RcRlfT/a\nkmqeUinXRMQ/UyUUQdXm9mcF470F+OPeKFlUE8q/QXWSU8KrqJqAbFM/vpGqW2MpO2fmARHxgsxc\nWZ94f6dgvP7GB/dQ3dKhSAlxT331cAl9x9pSo//ABv0lOJl5V0RsXCjWEcCJwK4R8Uuq98EhhWJB\nNbn6d0zMUeopkpjVJTinAafVx+vjqeZcbjDd7z0IbXeGW10/r89TneTfRPV+KOXuzDyh4PbXkZlL\n2opVexXVfcXujYjeZ19mofvC0X6DjO0z888KbXsdbY8eU83LTarbHPQr1U6+7XPOJ1A9v30Glpd6\nfmTmqli3S2jJY1krHUPbvHAwGxOztjvtfTEinpuZX2ojWGa+vf72c1F1rdkkM28uFO4H9YfvR4Ef\nUl0NKtkJq62D9yeBL1PdVPCNTAwp/zYL3DsjIk7NzEOprnI9komTwXOAkmV3QTWPrec33H8OUZN+\nQnVPkJ2AbaiaEbwAKDIJmomRj1vq+Xu/opp4XcpuVAn9U6muVH6H6n1RRAej//9XJ7lfqOO/gHLd\nvn5BNcLzTap7tN0KvIT7n9w0ooOr6L1S5RdTXf39ARP3HSqh1c5wfSWLK+qS262oSjcbVY9SB9UF\nyb+jOnYWKy3sYp5Svd0tYpJmKqV0kHieGxGPy4YbF03hFFocPe6gsqHtc86xNuL0WVVPR6K+MHgU\nBSth6KaDbtEGfrNmjll01GkvIm5j4mpsL8sueaWrfw5d/13YGz9Zq08Mv0V1AnoHsFXJA2tE/Dvw\nodIH74jYKjNvrU9kJvvwbfrD/sfAn1CduIyxbnKUBectHEdVitprq/ti4NLBeXUNxvsKEyOeaxOJ\nzHzvlL/04OK9nKot+B5UH8RbAP+SmR+Z9hfXP97pVAlE78aQfwVsnZl/WSjeFbQ4+l9/OH2CiZt0\nXwccmplXFYjV9mtlEdWoVW8u7repOnheVyjetVQJ9WnAmZnZ+G0cBuK12hmuLfXfccr2/JnZaHv+\nDucptdpMpS6h/wdgcVbzWXehuqlvkRtM18eynak60RXtitrB/NVWRujqc83ea3JzWjrn7GD+6kOo\nui3/CdXn7FeBo0pcNK/jtd1Bt3gDv9mUmI3V376bql60/+T33Zn5pIKx22wbPOlV9Mw8skCsZwL7\nUZ3M7ExVynVOZn6g4Ti9csINqP6ORQ/eEfGlzHxuREza1CAzGy3niIijgFdT/b8NjuQ2fnIxEPc6\nJk5Gz8kAsUYcAAAHdElEQVTM/y4Rq453eWbuXmr7k8Trn1DeX+pXqq3ujzNzt2HLGox3OlXy0Nbo\nfy/uFgAlk4kOXitfp0o6/7NedDDVh+9k80ybiLd1FrpNxDQxR7kb8YFUTa5ujYh/obr4emxmNnpj\n1r54bR9b2m6m8hmqiyIvyarhwebAuVmuUdOOVPfC2q9edA5wUxa4L2Q9gvsi4OtZdXzdB3hXZhaZ\npxQR/0M9QpdVQ7aNgItKHd8i4hNUF8zPyRZujNz28+tK/R7YIMt30C3ewG/WlDLmxF21NxpMiqLv\nRopNm+pKF1U2XEJrc+gy8+yo2lo/ker5vArYnWpyZpOe3/D2ppWZz62/PZcWDnCZeTxwfER8JDNf\nVSrOJBZSvTYvAk6iQKnRgDbLVaClOYl9LoyIJ2fmebC2XKzxE8OB0f8fR0TR0f+IODQzT42I19E3\nShBV0X1mmRt7tv1aeUhmntz3+JSIeG3BeL+LiCOoylV6nz+ZZTuGPomJROIJEVFyPmLb/jkzPxNV\nQ6FnUs2x/jCwd6F4bR9b2m6mslNmHhgRy6GaEzkwp6dpf07VSbBXCnoq1WjB8QVivY6qyc+jIuJc\n6tHjAnF62p7f+TGqi63H16M9F1HggnmfVp9ftN91su05icUb+M2axCy6a+jwGiaudD2jd6WrYLzW\n5tBFxDeohszPoypnfGIWaLdeash4Blo9wLWclJGZ/xRVs5FnUzXh+GB9pfRjWeaeR/sBh9cjkcVv\n4ktLcxL7jicbAt+NiNVUCcxi4KcFQvbK+d5NNUdvndH/AvE2q//dknXLt4JJyrkejIHR8TZfK7+J\niEOZKOtdTrn5c9B+x9C25yO2rfecngd8NDO/FBFFbkZea61ZRa3tZip3RUTvfd8r52r0XkoDXgbs\nnfV9qSLiXcD3KJCYZeYFUXXxa2v0uO35nW1dMO9p9fnRftfJU2i3o+1gA7+k6jbbmFmTmNFyQ4c+\nrVzpavsqeu1Sqjf/7lRza26K6n5Ad0z/a3NDBwe41mXmfRHxK+AGqpObbYHPRsTXs+EbI1Pd565N\nbY26TDei2/gHRtuj/5l5Yv3viqa3PYlWR8f7HA58COiN/p1bLyul7Y6hbXcjbtsvopqD/KfAO+tS\nwwUF47U6opstNVPpcwzV+dIO9WtzX8p20IW+G7sPfF9Cm6PHrY7QtXXBvE/bI5Btd51se8Tzp8C9\nmfm5iHgsVVl2o1NMZk1iVtfz30J1JbRNbV3pavsqOpn5WoCI2JLqoH0y1ZDr75WI17YODnCtiojX\nUHW6+w3VFZnX1wedBVRXhRpNzNoa+Wx71KXtEd2uRv/bKCHpYnQ8Ijakmq/TZlLYdsfQtrsRt+1A\nqtHH4zLz5oh4OA0fv6DTEd21ehdmCjsM+BLwWao53UdlZskR5JOB8yPiv6jOXf6cqry+cW2PHncw\nQtfqBfMOnl+rXSepRgS36z1oYURwsrLsE2iwLHvWNP+YDaJqQLIV1US+3w1ZfX1jrO001Lfssszc\no0CsI6nK0/aiOnifQ1Xqd3bTsboQEe+nOsDdycR8s5EZEYyItwInTTbBOiJ2y8wfd7BbD1pELJnu\n5x2WxjYiqnsGbkvLo/8RcR5VCckF9JWQZObnSsVsS0R8B3hWZpYs1+qP10rH0OioG/GoGvVjS0+0\n1NhrIOZedbysY11UKE6r3WzrmL1O2RtSV1GUnt/Zd8H89cDDMrPYBfM2nl9013VyL6qS2t1poaNt\n1B1CI+KdwGWZ+YnJzusfVAwTs3b0X0UH+ucHbQl8NzMPLhDzDVQnahcWvkLSqTYPcNJsFQVbSnct\nIj5O1YjjC8CaenGpxiatdfWLDrsRa26rR5L7y/jvyMySDUdaES13s51qhC4LdMqu47V6wbyD59d2\n18lNgSOAP6MagfwecHxmFmn6E9X9h39BVZa9J9XAwPnZYEdUE7OWdHUVfZSN+oig9EDUzRTOa7GE\npLiob+4eETcD7x/8edOJUl/ctu/T1lolhea+Scr4z5nrZfxdjR63PULX9gXzDp7fM6lGVvejGogo\n2pQt2r8/6eZUZdmXZuaVdVn2Hpn51cZimJhprpovI4LSdLoqIWlDTH9zd0pd1IqW7tPWRSWF5r5R\nLOPvavS47RG6tnXx/NoczY2W70/ahlnT/EN6oDLzuK73QepaZm4B7ZeQtOQjwDeoSnEG7zmX9fIS\n2urq11U3Ys1ho9jYq+1uth11ym5NV8+vg6ZsrdyftE2OmEnSCGi7hKRN0dLN3Qe6+u1CVSLdalc/\naZhRLONve/R41Od3djgC2cpobqx7f9JdgXXuT5qZj2kyXptMzCRpRIxqQ4C2zJeufprbRrGMv8Nu\ntiM9v7Or51e6KduQY3VO1s16rjAxk6QRMIoNASSphFGf39nV8xvF0dy2mZhJ0ggYxYYAklTCqHfK\n7nAEcuRGc9tmYiZJI8T7+kmSNDfZlVGSRsAkJSQnUZWRSJKkOcDETJJGwybAe7GERJKkOclSRkmS\nJEnq2IKud0CSJEmS5jsTM0mSJEnqmImZJEmSJHXMxEySJEmSOvb/AbYL7KazqqOlAAAAAElFTkSu\nQmCC\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7fb34249a5d0>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"%matplotlib inline\n",
"start = datetime.datetime.now()\n",
"\n",
"topWords = sqlContext.sql(\"SELECT * FROM wordcounts WHERE length(word) >=3 and count > 100000 order by count DESC LIMIT 30\")\n",
"pandaData = topWords.toPandas()\n",
"pandaData.plot(kind='bar', figsize=(15,3))\n",
"plt.xticks(pandaData.index.values, pandaData['word'].values)\n",
" \n",
"print 'Query took %s' % (datetime.datetime.now() - start)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment