db.data.drop()
db.data.insertOne({"name": "val1"})
db.data.insertOne({"_id": "id100", "name": "val1"})
db.data.insertMany([
    { "name": "val1" },
    { "name": "val2" }
])
db.data.insertMany(
    [
        { "name": "val1" },
        { "name": "val2" }
    ],
    { "ordered": false }
)
_id is of type ObjectId() by default
ObjectId is a 12-byte value (displayed as 24 hex chars): 4-byte timestamp + 3-byte machine id + 2-byte process id + 3-byte counter
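// quick shell sketch (illustrative; any ObjectId works):
var id = ObjectId()
id.getTimestamp()  // ISODate reconstructed from the leading 4 timestamp bytes
id.str.length      // 24 hex characters = 12 bytes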
db.data.find()
db.data.find().pretty()
db.data.find({"rated": "PG-13"})
db.data.find({"rated": "PG-13"}).count()
db.data.find({"rated": "PG-13", "year": 2009})
db.data.find({"tomato.meter": 100})
db.data.find({"writers": ["Eathan", "Joel"]}) // MATCH ENTIRE ARRAY
db.data.find({"actors": "Brad"}) // MATCH ANY ELEMENT OF ARRAY
db.data.find({"actors.0": "Brad"}) // MATCH FIRST ELEMENT OF ARRAY
// cursor
var c = db.data.find()
var doc = function() { return c.hasNext() ? c.next() : null; }
doc()
c.objsLeftInBatch()
// projection
db.data.find({"rated": "PG-13"}, { title: 1, _id: 0 })
// query operators
// comparison: $eq, $ne, $lt, $lte, $gt, $gte, $in, $nin
db.data.find({"rated": { $ne: "UNRATED" }})
db.data.find({"runtime": { $gt: 90 }})
db.data.find({"runtime": { $gte: 90, $lte: 120 }})
db.data.find({"tomato.meter": {$gt: 95}, "runtime": { $gte: 90, $lte: 120 }})
db.data.find({"rated": { $in: ["G", "PG"] }})
// element $exists, $type
db.data.find({"tomate.meter": { $exists: true }}).count()
db.data.find({"tomate.meter": { $exists: false }}).count()
.db.data.find({"_id": { $type: "string" }})
// logical operators $or $and $not $nor
db.data.find({ $or: [ {"tomato.meter": {$gte: 95}}, {"metacritic": {$gt: 88}} ] })
db.data.find({ $and: [ {"metacritic": {$ne: null}}, {"metacritic": {$exists: true}} ] })
// regex
d.data.find({ "awards.text": { $regex: /^Won\s.*/ } })
d.data.find({ "awards.text": { $regex: '^Won\s.*' } })
d.data.find({ "awards.text": { $regex: '^Won\s.*', $options: "i" } })
// array operators
{ "genre": { $all: ["Comedy", "Crime", "Drama"] } }
{ "country": { $size: 1 } }
{ "boxoffice": { $elemMatch: { "country": "UK", "revenue": { $gt: 15} } } }
db.movieDetails.find({'awards.oscars.award': 'bestPicture'})
db.movieDetails.updateMany(
    { year: {$gte: 2010, $lte: 2013},
      "imdb.votes": {$lt: 10000},
      $and: [ {"tomato.consensus": {$exists: true}},
              {"tomato.consensus": null} ] },
    { $unset: { "tomato.consensus": "" } }
);
// update
db.data.updateOne({"title": "The Martian"}, {$set: {"poster": "http://d.c"})
db.data.updateOne({"title": "The Martian"}, {$set: {"awards": {"wins": 0, "nominations": 2}}})
// update operators
// $set, $unset, $rename, $min, $max, $inc, $mul, ...
db.data.updateOne({"title": "Martian"}, {$inc: {"tomao.revies": 4, "tomato.userReviews: 3}})
// arrays operators: $addToSet $pop $pull $pullAll $push
db.data.updateOne(
    {"title": "Martian"},
    { $push: { "reviews":
        { "rating": 1, "author": "Someone", "text": ".."}
    } }
)
db.data.updateOne(
    {"title": "Martian"},
    { $push: { "reviews":
        { $each: [
            { "rating": 1, "author": "Someone", "text": ".."},
            { "rating": 2, "author": "Someone2", "text": ".."}
        ] }
    } }
)
// update many
db.data.updateMany({"rated": null}, {$set: {"rated": "UNRATED"} })
db.data.updateMany({"rated": null}, {$unset: {"rated": ""} })
// upsert
db.data.update({...}, {...}, {upsert: true})
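// a concrete upsert sketch (hypothetical filter and fields):
db.data.updateOne(
    {"title": "The Martian"},
    {$set: {"year": 2015}},
    {upsert: true}  // inserts {"title": "The Martian", "year": 2015} if no document matches
)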
// replaceOne
db.data.replaceOne()
SORT -> SKIP -> LIMIT (the order in which cursor modifiers are applied)
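// the order of chained calls does not matter; these return the same documents:
db.data.find().sort({year: -1}).skip(10).limit(5)
db.data.find().limit(5).skip(10).sort({year: -1})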
HOMEWORK
1:
db.grades.find({'score': {$gte: 65}}, {'_id': 0, 'student_id': 1}).sort({'score': 1}).limit(1)
2:
```python
import pymongo
m = pymongo.MongoClient()
g = m.students.grades
counter, last_student = 0, None
for doc in g.find({'type': 'homework'}).sort([('student_id', pymongo.ASCENDING), ('score', pymongo.ASCENDING)]):
    if last_student != doc['student_id']:  # first (lowest-scored) homework doc for this student
        counter += 1
        last_student = doc['student_id']
        g.remove({'_id': doc['_id']})  # remove the lowest homework score
```
3:
ch4:
// hint to use index
db.students.find({student_id:{$gt:500000}, class_id:54}).sort({student_id:1}).hint({class_id:1}).explain("executionStats")
// profiling
db.getProfilingStatus()
db.getProfilingLevel()
db.setProfilingLevel(0) // do not log queries
db.setProfilingLevel(1, 4) // log queries slower than 4 ms (slowms is in milliseconds)
db.setProfilingLevel(2) // log all queries
db.system.profile.find()
db.system.profile.find().sort({ts:1})
db.system.profile.find({ns:/school.students/})
db.system.profile.find({millis: {$lt:1}}, {ts:1, millis:1}).sort({ts:-1})
mongotop
mongotop 5 // 5 seconds refresh
mongostat
storage engines:
MMAPv1:
    collection-level locking
    in-place updates
    power-of-two record allocation
WiredTiger:
    document-level concurrency (optimistic, no locking needed)
    compression (of document data and indexes)
    append-only (no in-place updates)
killall mongod
mkdir WT
mongod --dbpath WT --storageEngine wiredTiger
mongod --dbpath WT --storageEngine wiredTiger --wiredTigerIndexPrefixCompression true
db.foo.stats()
index: btree
db.students.createIndex({student_id:1})
db.students.createIndex({student_id:1, class_id:-1})
db.students.createIndex({location:'2d'})
db.students.createIndex({location:'2dsphere'})
// full text search
db.students.createIndex({location:'text'})
db.data.find({'$text': {'$search': 'dog cat'}}) // matches documents containing 'dog' OR 'cat'
db.students.getIndexes()
db.students.dropIndex({student_id:1})
// multikey indexes - on array
db.students.createIndex({student_id:1}, {'unique':true})
db.students.createIndex({student_id:1}, {'unique':true, 'sparse':true}) // sparse - documents missing the field are left out of the index
db.students.createIndex({student_id:1}, {'background':true})
foreground index creation:
    fast, but blocks reads/writes on the whole DB (all collections)
background index creation:
    slower, does not block
explain: works with find, update, remove, aggregate (won't work with insert)
db.students.explain().find({student_id: 5}) //COLLSCAN, docsExamined
db.students.explain(true).find({student_id: 5}) //COLLSCAN, docsExamined
var exp = db.students.explain()
exp.help()
exp.find({'name': 'noo'}).sort({'name': 1})
explain verbosity:
queryPlanner // default
executionStats
allPlansExecution
var exp = db.students.explain('executionStats')
var exp = db.students.explain('allPlansExecution')
covered query: the query is satisfied entirely from the index (no documents need to be inspected)
NO  db.ijk_data.find({'i':10, 'k':20})                                   // returns _id, which is not in the index
NO  db.ijk_data.find({'i':10, 'k':20}, {'_id': 0})                       // exclusion-only projection may return non-indexed fields
YES db.ijk_data.find({'i':10, 'k':20}, {'_id': 0, 'i':1, 'j':1, 'k':1})  // only indexed fields are returned
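// the YES case assumes a compound index over all the projected fields, e.g.:
db.ijk_data.createIndex({i: 1, j: 1, k: 1})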
db.examples.stats()
db.examples.totalIndexSize()
db.students.find({'scores': { $elemMatch: {'type':'exam', 'score': {$gt: 98}} } })
db.stores.find({ loc:{ $near: { $geometry: { type: "Point", coordinates: [-130, 39]}, $maxDistance:1000000 } } })
for (i=0; i<10; i++) { db.ex.insert({'a': i*i}) }
for (i=0; i<10; i++) { x=[]; for (j=0; j<10; j++) { x.push({"key": i, "val": j}); }; db.ex.insert({'arr': x}); }
aggregation pipeline:
$project (reshape doc)
$match
$group
$sort (100 MB limit for in-memory sort, or disk-based sort)
$skip (after sort)
$limit (after sort)
$unwind (normalize, explode arrays)
$out (redirect output to a collection)
$redact
$geoNear
$group accumulators:
$sum $avg $min $max
$push $addToSet (for building arrays)
$first $last (only meaningful with $sort)
explain: db.zips.aggregate([...], {explain: true})
allowDiskUse (100 MB limit for in-memory aggregation): db.zips.aggregate([...], {allowDiskUse: true})
cursor: db.zips.aggregate([...], {cursor: {batchSize: 10}}) // results returned as a cursor
db.products.aggregate([{$group:{_id:"$manufacturer",num_products:{$sum:1}}}])
db.products.aggregate([{$group:{_id:{"cat":"$category"},num_products:{$sum:1}}}])
db.products.aggregate([{$group:{_id:{"maker": "$manufacturer", "cat":"$category"},num_products:{$sum:1}}}])
db.products.aggregate([{$group:{_id:{"maker": "$manufacturer"}, sum_price:{$sum:"$price"}}}])
db.zips.aggregate([{$group:{ _id: "$state", "population": {$sum:"$pop"}}}])
db.zips.aggregate([{$group:{ _id: "$_id", "average_pop": {$avg:"$pop"}}}])
db.zips.aggregate([{"$group":{"_id":"$state", "average_pop":{"$avg":"$pop"}}}])
db.zips.aggregate([{$group: {_id: "$city", "postal_codes": {$addToSet: "$_id"}}}])
db.zips.aggregate([{$group: {_id: "$state", "pop": {$max: "$pop"}}}])
db.grades.aggregate([
{'$group':{_id:{class_id:"$class_id", student_id:"$student_id"}, 'average':{"$avg":"$score"}}},
{'$group':{_id:"$_id.class_id", 'average':{"$avg":"$average"}}}])
db.products.aggregate([
{$project: {
_id:0,
'maker': {$toLower:"$manufacturer"},
'details': {'category': "$category",
'price' : {"$multiply":["$price",10]}
},
'item':'$name'
}}
])
db.zips.aggregate([{$project: {_id:0, "city": {$toLower:"$city"}, "pop":1, "state":1, "zip":"$_id"}}])
db.zips.aggregate([
{$match:
{
state:"NY"
}
},
{$group:
{
_id: "$city",
population: {$sum:"$pop"},
zip_codes: {$addToSet: "$_id"}
}
},
{$project:
{
_id: 0,
city: "$_id",
population: 1,
zip_codes:1
}
}
])
db.zips.aggregate([{$match:{pop:{$gt:100000}}}])
db.zips.aggregate([
{$match: {"state" : "NY"}},
{$group: {_id: "$city", "pop": {$sum: "$pop"}}},
{$project: {_id:0, "city": "$_id", pop:1}},
{$sort: {pop:-1}} ])
db.zips.aggregate([
    {$match: {pop: {$gt: 100000}}},
    {$skip: 4},
    {$limit: 10}
])
use agg
db.zips.aggregate([
/* get the population of every city in every state */
{$group:
{
_id: {state:"$state", city:"$city"},
population: {$sum:"$pop"},
}
},
/* sort by state, population */
{$sort:
{"_id.state":1, "population":-1}
},
/* group by state, get the first item in each group */
{$group:
{
_id:"$_id.state",
city: {$first: "$_id.city"},
population: {$first:"$population"}
}
}
])
db.posts.aggregate([
/* unwind by tags */
{"$unwind":"$tags"},
/* now group by tags, counting each tag */
{"$group":
{"_id":"$tags",
"count":{$sum:1}
}
},
/* sort by popularity */
{"$sort":{"count":-1}},
/* show me the top 10 */
{"$limit": 10},
/* change the name of _id to be tag */
{"$project":
{_id:0,
'tag':'$_id',
'count' : 1
}
}
])
use agg;
db.inventory.drop();
db.inventory.insert({'name':"Polo Shirt", 'sizes':["Small", "Medium", "Large"], 'colors':['navy', 'white', 'orange', 'red']})
db.inventory.insert({'name':"T-Shirt", 'sizes':["Small", "Medium", "Large", "X-Large"], 'colors':['navy', "black", 'orange', 'red']})
db.inventory.insert({'name':"Chino Pants", 'sizes':["32x32", "31x30", "36x32"], 'colors':['navy', 'white', 'orange', 'violet']})
db.inventory.aggregate([
{$unwind: "$sizes"},
{$unwind: "$colors"},
{$group:
{
'_id': {'size':'$sizes', 'color':'$colors'},
'count' : {'$sum':1}
}
}
])
db.products.aggregate([
    {$group: {_id: "$manufacturer", num_products: {$sum: 1}}}, // output _id must be unique
    {$out: 'collection_name'} // will drop the collection if it exists !!!
])
// hw5.1: find author with most comments
db.posts.aggregate([
{$project: {"_id":0, "authors":"$comments.author"}},
{$unwind: "$authors"},
{$group: {"_id": "$authors", 'sum': {$sum:1}}},
{$sort: {"sum": -1}}
])
// hw5.2: find avg population in cities of CA and NY with population over 25000
db.zips.aggregate([
{$match: {"state": {$in: ["CA", "NY"]} }},
{$group: { _id:{ "state": "$state", "city": "$city"}, "pop": {$sum: "$pop"} }},
{$match: {"pop":{$gt: 25000}}},
{$group: {"_id": null, "avg_pop": {$avg: "$pop"}}}
])
// hw5.3: class with the highest avg student score (excluding quizzes)
db.grades.aggregate([
{$project:{ _id: 0, "class_id": 1, "student_id":1, "scores": 1}},
{$unwind: "$scores"},
{$match: {"scores.type": {$ne: "quiz"}}},
{$group: {"_id": {"class_id": "$class_id", "student_id": "$student_id"}, "avg_score": {$avg: "$scores.score"}}},
{$group: {"_id": "$_id.class_id", "avg_scores": {$avg: "$avg_score"}}},
{$sort: {"avg_scores":-1}}
])
// hw5.4: find the number of people who live in a city whose name starts with a digit
db.zips.aggregate([
{$match: {"city": {$regex: /^[0-9]/}}},
{$group: {_id: null, "pop_sum": {$sum: "$pop"}}}
])
write concern:
    w: 1 (how many nodes must acknowledge the write)
    j: false (do not wait for the journal to be flushed to disk)
    wtimeout: how long to wait for the acknowledgment
    (all of these may be set per connection, per collection, or in the replica set config)
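// sketch: per-operation write concern (majority ack, wait for journal, 5 s timeout):
db.data.insertOne(
    {"name": "val1"},
    {writeConcern: {w: "majority", j: true, wtimeout: 5000}}
)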
replica set:
    1 node is primary (writes and reads), the other nodes are secondaries (reads only)
    replication is asynchronous
    if the primary goes down, one of the secondary nodes becomes primary (election)
    minimum number of nodes in a replica set: 3
types of nodes in a replica set:
    - regular (has data, can become primary)
    - arbiter (only for voting purposes)
    - delayed (disaster-recovery node, kept X hours behind the other nodes, cannot become primary)
    - hidden (e.g. for analytics, cannot become primary)
strong consistency:
    read and write only from the primary node
eventual consistency:
    write to the primary, but read from secondaries
    reads may briefly return stale data, so you may not immediately read what you wrote
// create replica set
#!/usr/bin/env bash
mkdir -p /data/rs1 /data/rs2 /data/rs3
mongod --replSet m101 --logpath "1.log" --dbpath /data/rs1 --port 27017 --oplogSize 64 --fork --smallfiles
mongod --replSet m101 --logpath "2.log" --dbpath /data/rs2 --port 27018 --oplogSize 64 --smallfiles --fork
mongod --replSet m101 --logpath "3.log" --dbpath /data/rs3 --port 27019 --oplogSize 64 --smallfiles --fork
// init replica set
config = { _id: "m101", members:[
{ _id : 0, host : "localhost:27017"},
{ _id : 1, host : "localhost:27018"},
{ _id : 2, host : "localhost:27019"} ]
};
rs.initiate(config);
rs.status()
rs.config() // show rs config
rs.slaveOk() // allow reads from secondary node
rs.stepDown() // ask the primary to step down and trigger an election
rs.help()
each rs node has oplog (secondary is constantly reading oplog of primary)
use local
show collections
db.oplog.rs.find()
a recovering or newly added node connects to the primary and, if it has fallen too far behind the oplog, copies the entire dataset (initial sync)
proper handling of failover:
insert: catch AutoReconnect errors; if AutoReconnect occurs, sleep X seconds and retry the operation
for retry in range(3):
    try:
        things.insert_one({'_id': i})
        print "Inserted Document: " + str(i)
        time.sleep(.1)
        break
    except pymongo.errors.AutoReconnect as e:
        print "Exception", type(e), e
        print "Retrying.."
        time.sleep(5)
    except pymongo.errors.DuplicateKeyError as e:
        print "duplicate..but it worked"
        break
read: retry
update:
    - idempotent ($set): retry
    - non-idempotent ($inc, $push): depends on the application - the update may be applied twice (see sketch below)
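// sketch of why idempotency matters on retry (hypothetical 'views' field):
db.data.updateOne({"title": "Martian"}, {$set: {"views": 43}})  // safe to retry: same end state
db.data.updateOne({"title": "Martian"}, {$inc: {"views": 1}})   // retried after a lost ack, it may count twice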
read preference:
    - Primary
    - PrimaryPreferred
    - Secondary (only secondaries)
    - SecondaryPreferred
    - Nearest (closest ping time)
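// shell-side sketch of setting a read preference (modes as listed above):
db.getMongo().setReadPref('secondaryPreferred')  // route this connection's reads to a secondary when available
db.students.find().readPref('nearest')           // per-query override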
sharding
mongos - router in front of the sharded hosts
shard key:
    - every doc must include the shard key
    - the shard key is immutable
    - the shard key must be indexed (an index, or a non-multikey index, starting with the shard key)
    - updates must specify the shard key or set 'multi'
    - queries that do not include the shard key are scatter-gathered to all shards
    - unique indexes are not allowed (unless the shard key is part of them)
types of sharding:
    - range-based (s1: 1..100, s2: 101..200, ...)
    - hash-based (hashing over the shard key)
sh.status()
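// a minimal sketch of sharding a collection (assumes db 'school' and shard key student_id):
sh.enableSharding("school")
db.students.createIndex({student_id: 1})  // the shard key must be indexed first
sh.shardCollection("school.students", {student_id: 1})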
# final exam
## Q1
```javascript
db.messages.find({'headers.From': 'andrew.fastow@enron.com', 'headers.To': 'jeff.skilling@enron.com'}, {'headers.From':1, 'headers.To':1})
```
## Q2
```javascript
// wrong
//db.messages.aggregate( {$project: {'_id': 0, 'From':'$headers.From', 'To': '$headers.To'}}, {$unwind: '$To'}, {'$group': {'_id': {'From':'$From', 'To': '$To'}, 'sum': {$sum:1} }}, {$sort: {"sum": -1}} )
// correct
db.messages.aggregate( {$project: {'From':'$headers.From', 'To': '$headers.To'}}, {$unwind: '$To'}, {$group: {_id: {'Msg': '$_id', 'From': '$From'}, 'To': {$addToSet: '$To'}}}, {$unwind: '$To'}, {'$group': {'_id': {'From':'$_id.From', 'To': '$To'}, 'sum': {$sum:1} }}, {$sort: {"sum": -1}} )
```
## Q3
```javascript
// get 'To' count
db.messages.aggregate( {$match: {'headers.Message-ID': '<8147308.1075851042335.JavaMail.evans@thyme>'}}, {$project: {'From': '$headers.From', 'To': '$headers.To'}}, {$unwind: '$To'}, {$group: {'_id':null, 'sum': {$sum: 1}}} )
// update
db.messages.update({'headers.Message-ID': '<8147308.1075851042335.JavaMail.evans@thyme>'}, {$push: {'headers.To': 'mrpotatohead@mongodb.com'}})
```
## Q4
```python
try:
    self.posts.update_one({'permalink': permalink},
                          {'$inc': {"comments.{ord}.num_likes".format(ord=comment_ordinal): 1}})
except Exception as err:
    print('Error INC likes. Error: {err}'.format(err=err))
```