Skip to content

Instantly share code, notes, and snippets.

@anshulkgupta93
Created January 13, 2016 10:44
Show Gist options
  • Save anshulkgupta93/a60b1b73ff2d17f997b4 to your computer and use it in GitHub Desktop.
Save anshulkgupta93/a60b1b73ff2d17f997b4 to your computer and use it in GitHub Desktop.
Data analysis with MongoDB
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/anshul/DecisionStats/python-codes\n"
]
}
],
"source": [
"cd /home/anshul/DecisionStats/python-codes/"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Import all Required Libraries"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import csv\n",
"import json\n",
"import pprint\n",
"import sys\n",
"sys.path.append(\"/usr/local/lib/python2.7/dist-packages\")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"DATAFILE = 'BigDiamonds.csv'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### This function will read the CSV file and convert it in Dictionary form so that can be stored in MongoDB"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def process_file(filename):\n",
" data = []\n",
" with open(filename, \"r\") as f:\n",
" reader = csv.DictReader(f)\n",
" for line in reader:\n",
" line[\"\"]=int(line[\"\"])\n",
" if line[\"price\"]!=\"NA\":\n",
" line[\"price\"]=float(line[\"price\"])\n",
" line[\"carat\"]=float(line[\"carat\"])\n",
" if line[\"x\"]!=\"NA\":\n",
" line[\"x\"]=float(line[\"x\"])\n",
" if line[\"y\"]!=\"NA\": \n",
" line[\"y\"]=float(line[\"y\"])\n",
" if line[\"z\"]!=\"NA\": \n",
" line[\"z\"]=float(line[\"z\"])\n",
" line[\"depth\"]=float(line[\"depth\"])\n",
" data.append(line)\n",
" return data\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Function to store Documents in examples database and in collection BigDiamonds in MongoDB"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def insert_data(data, db):\n",
"\n",
" db.BigDiamonds.insert_many(data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### This will store the csv data in json file and in MongoDB"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def csvToMongo():\n",
" data = process_file(DATAFILE)\n",
" \n",
" with open('BigDiamonds.json',\"w\") as f:\n",
" json.dump(data,f)\n",
" \n",
" \n",
" from pymongo import MongoClient\n",
" client = MongoClient(\"mongodb://localhost:27017\")\n",
" db = client.examples\n",
" insert_data(data, db)\n",
" print db.BigDiamonds.find_one()\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{u'': 1, u'cut': u'V.Good', u'color': u'K', u'clarity': u'I1', u'price': u'NA', u'measurements': u'3.96 x 3.95 x 2.52', u'carat': 0.25, u'depth': 63.7, u'cert': u'GIA', u'z': 2.52, u'table': u'59', u'y': 3.95, u'x': 3.96, u'_id': ObjectId('56962361ce42082cb0d619ef')}\n"
]
}
],
"source": [
"csvToMongo()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### To run aggregate queries"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def aggregate(db, pipeline):\n",
" return [doc for doc in db.BigDiamonds.aggregate(pipeline)]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from pymongo import MongoClient\n",
"client = MongoClient('localhost:27017')\n",
"db = client.examples\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Viewing First 20 documents of the data in MongoDB"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{u'': 1,\n",
" u'_id': ObjectId('56962361ce42082cb0d619ef'),\n",
" u'carat': 0.25,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'I1',\n",
" u'color': u'K',\n",
" u'cut': u'V.Good',\n",
" u'depth': 63.7,\n",
" u'measurements': u'3.96 x 3.95 x 2.52',\n",
" u'price': u'NA',\n",
" u'table': u'59',\n",
" u'x': 3.96,\n",
" u'y': 3.95,\n",
" u'z': 2.52}\n",
"{u'': 2,\n",
" u'_id': ObjectId('56962361ce42082cb0d619f0'),\n",
" u'carat': 0.23,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'I1',\n",
" u'color': u'G',\n",
" u'cut': u'Good',\n",
" u'depth': 58.1,\n",
" u'measurements': u'4.00 x 4.05 x 2.30',\n",
" u'price': u'NA',\n",
" u'table': u'61',\n",
" u'x': 4.0,\n",
" u'y': 4.05,\n",
" u'z': 2.3}\n",
"{u'': 3,\n",
" u'_id': ObjectId('56962361ce42082cb0d619f1'),\n",
" u'carat': 0.34,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'I2',\n",
" u'color': u'J',\n",
" u'cut': u'Good',\n",
" u'depth': 58.7,\n",
" u'measurements': u'4.56 x 4.53 x 2.67',\n",
" u'price': u'NA',\n",
" u'table': u'58',\n",
" u'x': 4.56,\n",
" u'y': 4.53,\n",
" u'z': 2.67}\n",
"{u'': 4,\n",
" u'_id': ObjectId('56962361ce42082cb0d619f2'),\n",
" u'carat': 0.21,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'I1',\n",
" u'color': u'D',\n",
" u'cut': u'V.Good',\n",
" u'depth': 60.6,\n",
" u'measurements': u'3.80 x 3.82 x 2.31',\n",
" u'price': u'NA',\n",
" u'table': u'60',\n",
" u'x': 3.8,\n",
" u'y': 3.82,\n",
" u'z': 2.31}\n",
"{u'': 5,\n",
" u'_id': ObjectId('56962361ce42082cb0d619f3'),\n",
" u'carat': 0.31,\n",
" u'cert': u'EGL',\n",
" u'clarity': u'I1',\n",
" u'color': u'K',\n",
" u'cut': u'V.Good',\n",
" u'depth': 62.2,\n",
" u'measurements': u'4.35 x 4.26 x 2.68',\n",
" u'price': u'NA',\n",
" u'table': u'59',\n",
" u'x': 4.35,\n",
" u'y': 4.26,\n",
" u'z': 2.68}\n",
"{u'': 6,\n",
" u'_id': ObjectId('56962361ce42082cb0d619f4'),\n",
" u'carat': 0.2,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'SI2',\n",
" u'color': u'G',\n",
" u'cut': u'Good',\n",
" u'depth': 64.4,\n",
" u'measurements': u'3.74 x 3.67 x 2.38',\n",
" u'price': u'NA',\n",
" u'table': u'60',\n",
" u'x': 3.74,\n",
" u'y': 3.67,\n",
" u'z': 2.38}\n",
"{u'': 7,\n",
" u'_id': ObjectId('56962361ce42082cb0d619f5'),\n",
" u'carat': 0.2,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'SI2',\n",
" u'color': u'G',\n",
" u'cut': u'Good',\n",
" u'depth': 62.6,\n",
" u'measurements': u'3.72 x 3.65 x 2.31',\n",
" u'price': u'NA',\n",
" u'table': u'63',\n",
" u'x': 3.72,\n",
" u'y': 3.65,\n",
" u'z': 2.31}\n",
"{u'': 8,\n",
" u'_id': ObjectId('56962361ce42082cb0d619f6'),\n",
" u'carat': 0.22,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'I1',\n",
" u'color': u'D',\n",
" u'cut': u'V.Good',\n",
" u'depth': 59.2,\n",
" u'measurements': u'3.95 x 3.97 x 2.34',\n",
" u'price': u'NA',\n",
" u'table': u'61',\n",
" u'x': 3.95,\n",
" u'y': 3.97,\n",
" u'z': 2.34}\n",
"{u'': 9,\n",
" u'_id': ObjectId('56962361ce42082cb0d619f7'),\n",
" u'carat': 0.23,\n",
" u'cert': u'IGI',\n",
" u'clarity': u'SI2',\n",
" u'color': u'K',\n",
" u'cut': u'V.Good',\n",
" u'depth': 63.6,\n",
" u'measurements': u'3.87 x 3.90 x 2.47',\n",
" u'price': u'NA',\n",
" u'table': u'57.5',\n",
" u'x': 3.87,\n",
" u'y': 3.9,\n",
" u'z': 2.47}\n",
"{u'': 10,\n",
" u'_id': ObjectId('56962361ce42082cb0d619f8'),\n",
" u'carat': 0.2,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'SI1',\n",
" u'color': u'F',\n",
" u'cut': u'Good',\n",
" u'depth': 54.9,\n",
" u'measurements': u'3.83 x 4.00 x 2.14',\n",
" u'price': u'NA',\n",
" u'table': u'65',\n",
" u'x': 3.83,\n",
" u'y': 4.0,\n",
" u'z': 2.14}\n",
"{u'': 11,\n",
" u'_id': ObjectId('56962361ce42082cb0d619f9'),\n",
" u'carat': 0.24,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'I1',\n",
" u'color': u'F',\n",
" u'cut': u'V.Good',\n",
" u'depth': 62.5,\n",
" u'measurements': u'3.96 x 3.90 x 2.46',\n",
" u'price': u'NA',\n",
" u'table': u'58',\n",
" u'x': 3.96,\n",
" u'y': 3.9,\n",
" u'z': 2.46}\n",
"{u'': 12,\n",
" u'_id': ObjectId('56962361ce42082cb0d619fa'),\n",
" u'carat': 0.2,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'SI2',\n",
" u'color': u'G',\n",
" u'cut': u'V.Good',\n",
" u'depth': 60.2,\n",
" u'measurements': u'3.76 x 3.79 x 2.27',\n",
" u'price': u'NA',\n",
" u'table': u'61',\n",
" u'x': 3.76,\n",
" u'y': 3.79,\n",
" u'z': 2.27}\n",
"{u'': 13,\n",
" u'_id': ObjectId('56962361ce42082cb0d619fb'),\n",
" u'carat': 0.22,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'SI2',\n",
" u'color': u'D',\n",
" u'cut': u'V.Good',\n",
" u'depth': 59.7,\n",
" u'measurements': u'3.94 x 3.93 x 2.35',\n",
" u'price': u'NA',\n",
" u'table': u'57',\n",
" u'x': 3.94,\n",
" u'y': 3.93,\n",
" u'z': 2.35}\n",
"{u'': 14,\n",
" u'_id': ObjectId('56962361ce42082cb0d619fc'),\n",
" u'carat': 0.23,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'SI2',\n",
" u'color': u'G',\n",
" u'cut': u'Good',\n",
" u'depth': 55.5,\n",
" u'measurements': u'4.12 x 4.04 x 2.27',\n",
" u'price': u'NA',\n",
" u'table': u'62',\n",
" u'x': 4.12,\n",
" u'y': 4.04,\n",
" u'z': 2.27}\n",
"{u'': 15,\n",
" u'_id': ObjectId('56962361ce42082cb0d619fd'),\n",
" u'carat': 0.23,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'I1',\n",
" u'color': u'F',\n",
" u'cut': u'V.Good',\n",
" u'depth': 62.5,\n",
" u'measurements': u'3.91 x 3.93 x 2.45',\n",
" u'price': u'NA',\n",
" u'table': u'59',\n",
" u'x': 3.91,\n",
" u'y': 3.93,\n",
" u'z': 2.45}\n",
"{u'': 16,\n",
" u'_id': ObjectId('56962361ce42082cb0d619fe'),\n",
" u'carat': 0.23,\n",
" u'cert': u'IGI',\n",
" u'clarity': u'SI2',\n",
" u'color': u'J',\n",
" u'cut': u'V.Good',\n",
" u'depth': 63.7,\n",
" u'measurements': u'3.88 x 3.94 x 2.49',\n",
" u'price': u'NA',\n",
" u'table': u'52.5',\n",
" u'x': 3.88,\n",
" u'y': 3.94,\n",
" u'z': 2.49}\n",
"{u'': 17,\n",
" u'_id': ObjectId('56962361ce42082cb0d619ff'),\n",
" u'carat': 0.2,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'VS2',\n",
" u'color': u'E',\n",
" u'cut': u'Good',\n",
" u'depth': 60.4,\n",
" u'measurements': u'3.81 x 3.74 x 2.28',\n",
" u'price': u'NA',\n",
" u'table': u'62',\n",
" u'x': 3.81,\n",
" u'y': 3.74,\n",
" u'z': 2.28}\n",
"{u'': 18,\n",
" u'_id': ObjectId('56962361ce42082cb0d61a00'),\n",
" u'carat': 0.29,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'I1',\n",
" u'color': u'G',\n",
" u'cut': u'V.Good',\n",
" u'depth': 64.0,\n",
" u'measurements': u'4.17 x 4.19 x 2.67',\n",
" u'price': u'NA',\n",
" u'table': u'58',\n",
" u'x': 4.17,\n",
" u'y': 4.19,\n",
" u'z': 2.67}\n",
"{u'': 19,\n",
" u'_id': ObjectId('56962361ce42082cb0d61a01'),\n",
" u'carat': 0.31,\n",
" u'cert': u'EGL',\n",
" u'clarity': u'I1',\n",
" u'color': u'F',\n",
" u'cut': u'V.Good',\n",
" u'depth': 62.8,\n",
" u'measurements': u'4.33 x 4.27 x 2.67',\n",
" u'price': u'NA',\n",
" u'table': u'59',\n",
" u'x': 4.33,\n",
" u'y': 4.27,\n",
" u'z': 2.67}\n",
"{u'': 20,\n",
" u'_id': ObjectId('56962361ce42082cb0d61a02'),\n",
" u'carat': 0.21,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'SI1',\n",
" u'color': u'E',\n",
" u'cut': u'Good',\n",
" u'depth': 62.5,\n",
" u'measurements': u'3.81 x 3.73 x 2.35',\n",
" u'price': u'NA',\n",
" u'table': u'66',\n",
" u'x': 3.81,\n",
" u'y': 3.73,\n",
" u'z': 2.35}\n"
]
}
],
"source": [
"count=0\n",
"for doc in db.BigDiamonds.find():\n",
" if count==20:\n",
" break\n",
" pprint.pprint(doc)\n",
" count+=1\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Number of Documents"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"598024"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"db.BigDiamonds.find().count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Selection Query "
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{u'': 2,\n",
" u'_id': ObjectId('56962361ce42082cb0d619f0'),\n",
" u'carat': 0.23,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'I1',\n",
" u'color': u'G',\n",
" u'cut': u'Good',\n",
" u'depth': 58.1,\n",
" u'measurements': u'4.00 x 4.05 x 2.30',\n",
" u'price': u'NA',\n",
" u'table': u'61',\n",
" u'x': 4.0,\n",
" u'y': 4.05,\n",
" u'z': 2.3}\n",
"{u'': 3,\n",
" u'_id': ObjectId('56962361ce42082cb0d619f1'),\n",
" u'carat': 0.34,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'I2',\n",
" u'color': u'J',\n",
" u'cut': u'Good',\n",
" u'depth': 58.7,\n",
" u'measurements': u'4.56 x 4.53 x 2.67',\n",
" u'price': u'NA',\n",
" u'table': u'58',\n",
" u'x': 4.56,\n",
" u'y': 4.53,\n",
" u'z': 2.67}\n",
"{u'': 6,\n",
" u'_id': ObjectId('56962361ce42082cb0d619f4'),\n",
" u'carat': 0.2,\n",
" u'cert': u'GIA',\n",
" u'clarity': u'SI2',\n",
" u'color': u'G',\n",
" u'cut': u'Good',\n",
" u'depth': 64.4,\n",
" u'measurements': u'3.74 x 3.67 x 2.38',\n",
" u'price': u'NA',\n",
" u'table': u'60',\n",
" u'x': 3.74,\n",
" u'y': 3.67,\n",
" u'z': 2.38}\n"
]
}
],
"source": [
"for doc in db.BigDiamonds.find({\"cut\":\"Good\"})[:3]:\n",
" pprint.pprint(doc)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Running aggregation queries"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{u'_id': u'null',\n",
" u'average_carat': 1.0722833163976402,\n",
" u'average_price': 8753.0179738863}]\n"
]
}
],
"source": [
"def make_pipeline():\n",
" # complete the aggregation pipeline\n",
" pipeline = [{\"$match\":{\"price\": {\"$ne\":\"NA\"}\n",
" }\n",
" }, \n",
" {\"$group\":{\"_id\":\"null\",\n",
" \"average_price\":{\"$avg\":\"$price\"},\n",
" \"average_carat\":{\"$avg\":\"$carat\"}\n",
" }\n",
" }\n",
" ]\n",
" return pipeline\n",
"pipeline = make_pipeline()\n",
"result = aggregate(db, pipeline)\n",
"pprint.pprint(result)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{u'_id': u'K', u'average_carat': 1.4950645585278706},\n",
" {u'_id': u'L', u'average_carat': 1.3632705053852925},\n",
" {u'_id': u'J', u'average_carat': 1.3475398796936426},\n",
" {u'_id': u'I', u'average_carat': 1.2712822628836897},\n",
" {u'_id': u'H', u'average_carat': 1.2099406596704285},\n",
" {u'_id': u'G', u'average_carat': 1.063840796640399},\n",
" {u'_id': u'F', u'average_carat': 0.9410531884197142},\n",
" {u'_id': u'E', u'average_carat': 0.8318823743354592},\n",
" {u'_id': u'D', u'average_carat': 0.8266182262664965}]\n"
]
}
],
"source": [
"def make_pipeline():\n",
" # complete the aggregation pipeline\n",
" pipeline = [{\"$group\":{\"_id\":\"$color\",\n",
" \"average_carat\":{\"$avg\":\"$carat\"},\n",
" }\n",
" },\n",
" {\"$sort\":{\"average_carat\":-1}\n",
" }\n",
" ]\n",
" return pipeline\n",
"pipeline = make_pipeline()\n",
"result = aggregate(db, pipeline)\n",
"pprint.pprint(result)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{u'_id': u'Ideal', u'average_carat': 1.120193559039977},\n",
" {u'_id': u'V.Good', u'average_carat': 1.0247596745924552},\n",
" {u'_id': u'Good', u'average_carat': 0.9003031166220351}]\n"
]
}
],
"source": [
"def make_pipeline():\n",
" # complete the aggregation pipeline\n",
" pipeline = [{\"$group\":{\"_id\":\"$cut\",\n",
" \"average_carat\":{\"$avg\":\"$carat\"},\n",
" }\n",
" },\n",
" {\"$sort\":{\"average_carat\":-1}\n",
" }\n",
" ]\n",
" return pipeline\n",
"pipeline = make_pipeline()\n",
"result = aggregate(db, pipeline)\n",
"pprint.pprint(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment