Skip to content

Instantly share code, notes, and snippets.

@saptak
Created February 18, 2016 01:56
Show Gist options
  • Save saptak/fac53e0aa2171e0d81a5 to your computer and use it in GitHub Desktop.
Week 4 solution
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u''"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flightRdd=sc.textFile(\"/tmp/flights.csv\") \\\n",
".map(lambda line: line.split(\",\"))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u''"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"carrierRdd = flightRdd.map(lambda line: (line[5],1))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u\"[(u'WN', 1)]\""
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"carrierRdd.take(1)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u''"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cReducedRdd = carrierRdd.reduceByKey(lambda a,b: a+b)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u''"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"carriersSorted = cReducedRdd.map(lambda (a,b): (b,a)) \\\n",
".sortByKey(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u\"[(11807, u'WN'), (5819, u'AA'), (5550, u'OO')]\""
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"carriersSorted.take(3)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u''"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"airportsRdd = sc.textFile(\"/tmp/airports.csv\") \\\n",
".map(lambda line: line.split(\",\"))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u''"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cityRdd = airportsRdd.map(lambda line: (line[0].strip('\"'), line[2].strip('\"')))\n",
"flightOrigDestRdd = flightRdd \\\n",
".map(lambda line: (line[12], line[13]))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u\"[(u'iata', u'city'), (u'ZZV', u'Zanesville'), (u'ZUN', u'Zuni')]\""
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cityRdd.top(3)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u''"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"origJoinRdd = flightOrigDestRdd.join(cityRdd)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u''"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"destAndOrigJoinRdd = origJoinRdd \\\n",
".map(lambda (a,b): (b[0],b[1])).join(cityRdd)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u''"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"citiesCleanRdd = destAndOrigJoinRdd.values()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u''"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"citiesReducedRdd = citiesCleanRdd \\\n",
".map(lambda line: (line,1)).reduceByKey(lambda a,b: a+b)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u\"[(164, (u'New York', u'Boston')), (162, (u'Boston', u'New York')), (150, (u'New York', u'Arlington')), (140, (u'Los Angeles', u'San Diego')), (137, (u'Los Angeles', u'San Francisco'))]\""
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"citiesReducedRdd.map(lambda (a,b): \\\n",
"(b,a)).sortByKey(ascending=False).take(5)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u\"[(u'AA', 953), (u'OO', 499), (u'DL', 455), (u'CO', 759), (u'UA', 960), (u'9E', 335), (u'AS', 418), (u'US', 757), (u'AQ', 300), (u'B6', 422)]\""
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flightRdd.filter(lambda line: int(line[11]) > 15) \\\n",
".map(lambda line: (line[5], line[11])) \\\n",
".reduceByKey(lambda a,b: max(int(a),int(b))).take(10)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u\"[(950, u'A320-232'), (747, u'737-7H4')]\""
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"airplanesRdd = sc.textFile(\"/tmp/plane-data.csv\") \\\n",
".map(lambda line: line.split(\",\")) \\\n",
".filter(lambda line:len(line) == 9)\n",
"flight15Rdd = flightRdd \\\n",
".filter(lambda line: int(line[14]) > 1500) \\\n",
".map(lambda line: (line[7],1))\n",
"tailModelRdd = airplanesRdd \\\n",
".map(lambda line: (line[0],line[4]))\n",
"flight15Rdd.join(tailModelRdd) \\\n",
".map(lambda (a,b): (b[1],b[0])) \\\n",
".reduceByKey(lambda a,b: a+b) \\\n",
".map(lambda (a,b): (b,a)).sortByKey(ascending=False).take(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "PySpark",
"language": "python",
"name": "pysparkkernel"
},
"language_info": {
"mimetype": "text/x-python",
"name": "pyspark"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment