Skip to content

Instantly share code, notes, and snippets.

@nicor88
Created May 31, 2017 16:17
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nicor88/7f935d1a7636121f94e83bc68a9c744b to your computer and use it in GitHub Desktop.
Save nicor88/7f935d1a7636121f94e83bc68a9c744b to your computer and use it in GitHub Desktop.
Jupyter Pyspark Examples
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+-------+---+------+-----+\n",
"| id| name|age|points|level|\n",
"+---+-------+---+------+-----+\n",
"| 1|Carleen| 24| 245| 5|\n",
"| 2| Steve| 31| 567| 7|\n",
"| 3| Ann| 41| 354| 5|\n",
"| 4| Lars| 30| 156| 3|\n",
"+---+-------+---+------+-----+\n",
"\n"
]
}
],
"source": [
"# `sqlContext` is not created in this notebook; it is the same handle the S3 read cell uses.\n",
"people_rows = [\n",
"    (1, 'Carleen', 24, 245, 5),\n",
"    (2, 'Steve', 31, 567, 7),\n",
"    (3, 'Ann', 41, 354, 5),\n",
"    (4, 'Lars', 30, 156, 3),\n",
"]\n",
"people_columns = ('id', 'name', 'age', 'points', 'level')\n",
"df = sqlContext.createDataFrame(people_rows, people_columns)\n",
"df.show()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+-----+---+------+-----+\n",
"| id| name|age|points|level|\n",
"+---+-----+---+------+-----+\n",
"| 2|Steve| 31| 567| 7|\n",
"| 3| Ann| 41| 354| 5|\n",
"+---+-----+---+------+-----+\n",
"\n"
]
}
],
"source": [
"# Keep only the rows whose age is strictly greater than 30.\n",
"over_30 = df['age'] > 30\n",
"df.filter(over_30).show()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+------------------+-----+-----------------+-----------------+-----------------+\n",
"|summary| id| name| age| points| level|\n",
"+-------+------------------+-----+-----------------+-----------------+-----------------+\n",
"| count| 4| 4| 4| 4| 4|\n",
"| mean| 2.5| null| 31.5| 330.5| 5.0|\n",
"| stddev|1.2909944487358056| null|7.047458170621991|177.2427713617681|1.632993161855452|\n",
"| min| 1| Ann| 24| 156| 3|\n",
"| max| 4|Steve| 41| 567| 7|\n",
"+-------+------------------+-----+-----------------+-----------------+-----------------+\n",
"\n"
]
}
],
"source": [
"# Summary statistics (count, mean, stddev, min, max) for the columns of df.\n",
"summary = df.describe()\n",
"summary.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read from S3"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Keep the full Parquet frame under its own name so the projection in the next cell can be re-run.\n",
"flights_raw = sqlContext.read.parquet('s3://us-east-1.elasticmapreduce.samples/flightdata/input')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Project the raw frame down to the seven columns of interest without overwriting it.\n",
"df = flights_raw.select('flightdate', 'origin', 'dest', 'airtime', 'distance', 'cancelled', 'securitydelay')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------+------+----+-------+--------+---------+-------------+\n",
"|flightdate|origin|dest|airtime|distance|cancelled|securitydelay|\n",
"+----------+------+----+-------+--------+---------+-------------+\n",
"|2007-01-01| MSP| DFW| 123| 852| 0| null|\n",
"|2007-01-01| DFW| MSP| 110| 852| 0| null|\n",
"|2007-01-01| ROC| MSP| 117| 783| 0| null|\n",
"|2007-01-01| MSP| OKC| 102| 695| 0| 0|\n",
"|2007-01-01| MSP| OKC| 101| 695| 0| null|\n",
"|2007-01-01| DTW| LNK| 105| 701| 0| null|\n",
"|2007-01-01| MEM| MSP| 105| 700| 0| 0|\n",
"|2007-01-01| MSP| MDT| 110| 898| 0| null|\n",
"|2007-01-01| MSP| AVL| 110| 861| 0| null|\n",
"|2007-01-01| AVL| MSP| 120| 861| 0| null|\n",
"|2007-01-01| DTW| XNA| 112| 716| 0| 0|\n",
"|2007-01-01| TUL| DTW| 107| 790| 0| 0|\n",
"|2007-01-01| DTW| TUL| 120| 790| 0| null|\n",
"|2007-01-01| AUS| DTW| 161| 1149| 0| null|\n",
"|2007-01-01| BGR| DTW| 132| 750| 0| 0|\n",
"|2007-01-01| BGR| DTW| 119| 750| 0| 0|\n",
"|2007-01-01| IND| FLL| 156| 1005| 0| 0|\n",
"|2007-01-01| FLL| IND| 138| 1005| 0| 0|\n",
"|2007-01-01| DSM| DCA| 112| 897| 0| null|\n",
"|2007-01-01| DCA| DSM| 141| 897| 0| 0|\n",
"+----------+------+----+-------+--------+---------+-------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"# Rows with airtime greater than 100 (units are not stated in this notebook); show() prints the first 20.\n",
"long_flights = df.filter(df['airtime'] > 100)\n",
"long_flights.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment