db.data.drop()
db.data.insertOne({"name": "val1"})
db.data.insertOne({"_id": "id100", "name": "val1"})
db.data.insertMany([
    { "name": "val1" },
    { "name": "val2" }
])
db.data.insertMany(
    [
        { "name": "val1" },
        { "name": "val2" }
    ],
    { "ordered": false }
)
_id is of type ObjectId() by default
ObjectId is a 12-byte value (displayed as 24 hex chars): 4-byte timestamp + 3-byte machine id + 2-byte process id + 3-byte counter
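// quick shell sketch (illustrative; any ObjectId works):
var id = ObjectId()
id.getTimestamp()  // ISODate reconstructed from the leading 4 timestamp bytes
id.str.length      // 24 hex characters = 12 bytes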
db.data.find()
db.data.find().pretty()
db.data.find({"rated": "PG-13"})
db.data.find({"rated": "PG-13"}).count()
db.data.find({"rated": "PG-13", "year": 2009})
db.data.find({"tomato.meter": 100})
db.data.find({"writers": ["Eathan", "Joel"]}) // MATCH ENTIRE ARRAY
db.data.find({"actors": "Brad"}) // MATCH ANY ELEMENT OF ARRAY
db.data.find({"actors.0": "Brad"}) // MATCH FIRST ELEMENT OF ARRAY
// cursor
var c = db.data.find()
var doc = function() { return c.hasNext() ? c.next() : null; }
doc()
c.objsLeftInBatch()
// projection
db.data.find({"rated": "PG-13"}, { title: 1, _id: 0 })
// query operators
// comparison: $eq, $ne, $lt, $lte, $gt, $gte, $in, $nin
db.data.find({"rated": { $ne: "UNRATED" }})
db.data.find({"runtime": { $gt: 90 }})
db.data.find({"runtime": { $gte: 90, $lte: 120 }})
db.data.find({"tomato.meter": {$gt: 95}, "runtime": { $gte: 90, $lte: 120 }})
db.data.find({"rated": { $in: ["G", "PG"] }})
// element $exists, $type
db.data.find({"tomate.meter": { $exists: true }}).count()
db.data.find({"tomate.meter": { $exists: false }}).count()
.db.data.find({"_id": { $type: "string" }})
// logical operators $or $and $not $nor
db.data.find({ $or: [ {"tomato.meter": {$gte: 95}}, {"metacritic": {$gt: 88}} ] })
db.data.find({ $and: [ {"metacritic": {$ne: null}}, {"metacritic": {$exists: true}} ] })
// regex
d.data.find({ "awards.text": { $regex: /^Won\s.*/ } })
d.data.find({ "awards.text": { $regex: '^Won\s.*' } })
d.data.find({ "awards.text": { $regex: '^Won\s.*', $options: "i" } })
// array operators
{ "genre": { $all: ["Comedy", "Crime", "Drama"] } }
{ "country": { $size: 1 } }
{ "boxoffice": { $elemMatch: { "country": "UK", "revenue": { $gt: 15} } } }
db.movieDetails.find({'awards.oscars.award': 'bestPicture'})
db.movieDetails.updateMany(
    { year: {$gte: 2010, $lte: 2013},
      "imdb.votes": {$lt: 10000},
      $and: [ {"tomato.consensus": {$exists: true}},
              {"tomato.consensus": null} ] },
    { $unset: { "tomato.consensus": "" } }
);
// update
db.data.updateOne({"title": "The Martian"}, {$set: {"poster": "http://d.c"})
db.data.updateOne({"title": "The Martian"}, {$set: {"awards": {"wins": 0, "nominations": 2}}})
// update operators
// $set, $unset, $rename, $min, $max, $inc, $mul, ...
db.data.updateOne({"title": "Martian"}, {$inc: {"tomao.revies": 4, "tomato.userReviews: 3}})
// arrays operators: $addToSet $pop $pull $pullAll $push
db.data.updateOne(
    {"title": "Martian"},
    { $push: { "reviews":
        { "rating": 1, "author": "Someone", "text": ".."}
    } }
)
db.data.updateOne(
    {"title": "Martian"},
    { $push: { "reviews":
        { $each: [
            { "rating": 1, "author": "Someone", "text": ".."},
            { "rating": 2, "author": "Someone2", "text": ".."}
        ] }
    } }
)
// update many
db.data.updateMany({"rated": null}, {$set: {"rated": "UNRATED"} })
db.data.updateMany({"rated": null}, {$unset: {"rated": ""} })
// upsert
db.data.update({...}, {...}, {upsert: true})
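// a concrete upsert sketch (hypothetical filter and fields):
db.data.updateOne(
    {"title": "The Martian"},
    {$set: {"year": 2015}},
    {upsert: true}  // inserts {"title": "The Martian", "year": 2015} if no document matches
)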
// replaceOne
db.data.replaceOne()
SORT -> SKIP -> LIMIT (the order in which cursor modifiers are applied)
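// the order of chained calls does not matter; these return the same documents:
db.data.find().sort({year: -1}).skip(10).limit(5)
db.data.find().limit(5).skip(10).sort({year: -1})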
HOMEWORK
1:
db.grades.find({'score': {$gte: 65}}, {'_id': 0, 'student_id': 1}).sort({'score': 1}).limit(1)
2:
```python
import pymongo
m = pymongo.MongoClient()
g = m.students.grades
counter, last_student = 0, None
for doc in g.find({'type': 'homework'}).sort([('student_id', pymongo.ASCENDING), ('score', pymongo.ASCENDING)]):
    if last_student != doc['student_id']:  # first (lowest-scored) homework doc for this student
        counter += 1
        last_student = doc['student_id']
        g.remove({'_id': doc['_id']})  # remove the lowest homework score
```
3:
ch4:
// hint to use index
db.students.find({student_id:{$gt:500000}, class_id:54}).sort({student_id:1}).hint({class_id:1}).explain("executionStats")
// profiling
db.getProfilingStatus()
db.getProfilingLevel()
db.setProfilingLevel(0) // do not log queries
db.setProfilingLevel(1, 4) // log queries slower than 4 ms (slowms is in milliseconds)
db.setProfilingLevel(2) // log all queries
db.system.profile.find()
db.system.profile.find().sort({ts:1})
db.system.profile.find({ns:/school.students/})
db.system.profile.find({millis: {$lt:1}}, {ts:1, millis:1}).sort({ts:-1})
mongotop
mongotop 5 // 5 seconds refresh
mongostat
storage engines:
MMAPv1:
    collection-level locking
    in-place updates
    power-of-two record allocation
WiredTiger:
    document-level concurrency (optimistic, no locking needed)
    compression (of document data and indexes)
    append-only (no in-place updates)
killall mongod
mkdir WT
mongod --dbpath WT --storageEngine wiredTiger
mongod --dbpath WT --storageEngine wiredTiger --wiredTigerIndexPrefixCompression true
db.foo.stats()
index: btree
db.students.createIndex({student_id:1})
db.students.createIndex({student_id:1, class_id:-1})
db.students.createIndex({location:'2d'})
db.students.createIndex({location:'2dsphere'})
// full text search
db.students.createIndex({location:'text'})
db.data.find({'$text': {'$search': 'dog cat'}}) // matches documents containing 'dog' OR 'cat'
db.students.getIndexes()
db.students.dropIndex({student_id:1})
// multikey indexes - on array
db.students.createIndex({student_id:1}, {'unique':true})
db.students.createIndex({student_id:1}, {'unique':true, 'sparse':true}) // sparse - documents missing the field are left out of the index
db.students.createIndex({student_id:1}, {'background':true})
foreground index creation:
    fast, but blocks reads/writes on the whole DB (all collections)
background index creation:
    slower, does not block
explain: works with find, update, remove, aggregate (won't work with insert)
db.students.explain().find({student_id: 5}) //COLLSCAN, docsExamined
db.students.explain(true).find({student_id: 5}) //COLLSCAN, docsExamined
var exp = db.students.explain()
exp.help()
exp.find({'name': 'noo'}).sort({'name': 1})
explain verbosity:
queryPlanner // default
executionStats
allPlansExecution
var exp = db.students.explain('executionStats')
var exp = db.students.explain('allPlansExecution')
covered query: the query is satisfied entirely from the index (no documents need to be inspected)
NO  db.ijk_data.find({'i':10, 'k':20})                                   // returns _id, which is not in the index
NO  db.ijk_data.find({'i':10, 'k':20}, {'_id': 0})                       // exclusion-only projection may return non-indexed fields
YES db.ijk_data.find({'i':10, 'k':20}, {'_id': 0, 'i':1, 'j':1, 'k':1})  // only indexed fields are returned
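// the YES case assumes a compound index over all the projected fields, e.g.:
db.ijk_data.createIndex({i: 1, j: 1, k: 1})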
db.examples.stats()
db.examples.totalIndexSize()
db.students.find({'scores': { $elemMatch: {'type':'exam', 'score': {$gt: 98}} } })
db.stores.find({ loc:{ $near: { $geometry: { type: "Point", coordinates: [-130, 39]}, $maxDistance:1000000 } } })
for (i=0; i<10; i++) { db.ex.insert({'a': i*i}) }
for (i=0; i<10; i++) { x=[]; for (j=0; j<10; j++) { x.push({"key": i, "val": j}); }; db.ex.insert({'arr': x}); }
aggregation pipeline:
$project (reshape doc)
$match
$group
$sort (100 MB limit for in-memory sort, or disk-based sort)
$skip (after sort)
$limit (after sort)
$unwind (normalize, explode arrays)
$out (redirect output to a collection)
$redact
$geoNear
$group accumulators:
$sum $avg $min $max
$push $addToSet (for building arrays)
$first $last (only meaningful with $sort)
explain: db.zips.aggregate([...], {explain: true})
allowDiskUse (100 MB limit for in-memory aggregation): db.zips.aggregate([...], {allowDiskUse: true})
cursor: db.zips.aggregate([...], {cursor: {batchSize: 10}}) // results returned as a cursor
db.products.aggregate([{$group:{_id:"$manufacturer",num_products:{$sum:1}}}])
db.products.aggregate([{$group:{_id:{"cat":"$category"},num_products:{$sum:1}}}])
db.products.aggregate([{$group:{_id:{"maker": "$manufacturer", "cat":"$category"},num_products:{$sum:1}}}])
db.products.aggregate([{$group:{_id:{"maker": "$manufacturer"}, sum_price:{$sum:"$price"}}}])
db.zips.aggregate([{$group:{ _id: "$state", "population": {$sum:"$pop"}}}])
db.zips.aggregate([{$group:{ _id: "$_id", "average_pop": {$avg:"$pop"}}}])
db.zips.aggregate([{"$group":{"_id":"$state", "average_pop":{"$avg":"$pop"}}}])
db.zips.aggregate([{$group: {_id: "$city", "postal_codes": {$addToSet: "$_id"}}}])
db.zips.aggregate([{$group: {_id: "$state", "pop": {$max: "$pop"}}}])
db.grades.aggregate([
{'$group':{_id:{class_id:"$class_id", student_id:"$student_id"}, 'average':{"$avg":"$score"}}},
{'$group':{_id:"$_id.class_id", 'average':{"$avg":"$average"}}}])
db.products.aggregate([
{$project: {
_id:0,
'maker': {$toLower:"$manufacturer"},
'details': {'category': "$category",
'price' : {"$multiply":["$price",10]}
},
'item':'$name'
}}
])
db.zips.aggregate([{$project: {_id:0, "city": {$toLower:"$city"}, "pop":1, "state":1, "zip":"$_id"}}])
db.zips.aggregate([
{$match:
{
state:"NY"
}
},
{$group:
{
_id: "$city",
population: {$sum:"$pop"},
zip_codes: {$addToSet: "$_id"}
}
},
{$project:
{
_id: 0,
city: "$_id",
population: 1,
zip_codes:1
}
}
])
db.zips.aggregate([{$match:{pop:{$gt:100000}}}])
db.zips.aggregate([
{$match: {"state" : "NY"}},
{$group: {_id: "$city", "pop": {$sum: "$pop"}}},
{$project: {_id:0, "city": "$_id", pop:1}},
{$sort: {pop:-1}} ])
db.zips.aggregate([
    {$match: {pop: {$gt: 100000}}},
    {$skip: 4},
    {$limit: 10}
])
use agg
db.zips.aggregate([
/* get the population of every city in every state */
{$group:
{
_id: {state:"$state", city:"$city"},
population: {$sum:"$pop"},
}
},
/* sort by state, population */
{$sort:
{"_id.state":1, "population":-1}
},
/* group by state, get the first item in each group */
{$group:
{
_id:"$_id.state",
city: {$first: "$_id.city"},
population: {$first:"$population"}
}
}
])
db.posts.aggregate([
/* unwind by tags */
{"$unwind":"$tags"},
/* now group by tags, counting each tag */
{"$group":
{"_id":"$tags",
"count":{$sum:1}
}
},
/* sort by popularity */
{"$sort":{"count":-1}},
/* show me the top 10 */
{"$limit": 10},
/* change the name of _id to be tag */
{"$project":
{_id:0,
'tag':'$_id',
'count' : 1
}
}
])
use agg;
db.inventory.drop();
db.inventory.insert({'name':"Polo Shirt", 'sizes':["Small", "Medium", "Large"], 'colors':['navy', 'white', 'orange', 'red']})
db.inventory.insert({'name':"T-Shirt", 'sizes':["Small", "Medium", "Large", "X-Large"], 'colors':['navy', "black", 'orange', 'red']})
db.inventory.insert({'name':"Chino Pants", 'sizes':["32x32", "31x30", "36x32"], 'colors':['navy', 'white', 'orange', 'violet']})
db.inventory.aggregate([
{$unwind: "$sizes"},
{$unwind: "$colors"},
{$group:
{
'_id': {'size':'$sizes', 'color':'$colors'},
'count' : {'$sum':1}
}
}
])
db.products.aggregate([
    {$group: {_id: "$manufacturer", num_products: {$sum: 1}}}, // output _id must be unique
    {$out: 'collection_name'} // will drop the collection if it exists !!!
])
// hw5.1: find author with most comments
db.posts.aggregate([
{$project: {"_id":0, "authors":"$comments.author"}},
{$unwind: "$authors"},
{$group: {"_id": "$authors", 'sum': {$sum:1}}},
{$sort: {"sum": -1}}
])
// hw5.2: find avg population in cities of CA and NY with population over 25000
db.zips.aggregate([
{$match: {"state": {$in: ["CA", "NY"]} }},
{$group: { _id:{ "state": "$state", "city": "$city"}, "pop": {$sum: "$pop"} }},
{$match: {"pop":{$gt: 25000}}},
{$group: {"_id": null, "avg_pop": {$avg: "$pop"}}}
])
// hw5.3: class with the highest avg student score (excluding quizzes)
db.grades.aggregate([
{$project:{ _id: 0, "class_id": 1, "student_id":1, "scores": 1}},
{$unwind: "$scores"},
{$match: {"scores.type": {$ne: "quiz"}}},
{$group: {"_id": {"class_id": "$class_id", "student_id": "$student_id"}, "avg_score": {$avg: "$scores.score"}}},
{$group: {"_id": "$_id.class_id", "avg_scores": {$avg: "$avg_score"}}},
{$sort: {"avg_scores":-1}}
])
// hw5.4: find the number of people who live in a city whose name starts with a digit
db.zips.aggregate([
{$match: {"city": {$regex: /^[0-9]/}}},
{$group: {_id: null, "pop_sum": {$sum: "$pop"}}}
])
write concern:
    w: 1 (how many nodes must acknowledge the write)
    j: false (do not wait for the journal to be flushed to disk)
    wtimeout: how long to wait for the acknowledgment
    (all of these may be set per connection, per collection, or in the replica set config)
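// sketch: per-operation write concern (majority ack, wait for journal, 5 s timeout):
db.data.insertOne(
    {"name": "val1"},
    {writeConcern: {w: "majority", j: true, wtimeout: 5000}}
)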
replica set:
    1 node is primary (writes and reads), the other nodes are secondaries (reads only)
    replication is asynchronous
    if the primary goes down, one of the secondary nodes becomes primary (election)
    minimum number of nodes in a replica set: 3
types of nodes in a replica set:
    - regular (has data, can become primary)
    - arbiter (only for voting purposes)
    - delayed (disaster-recovery node, kept X hours behind the other nodes, cannot become primary)
    - hidden (e.g. for analytics, cannot become primary)
strong consistency:
    read and write only from the primary node
eventual consistency:
    write to the primary, but read from secondaries
    reads may briefly return stale data, so you may not immediately read what you wrote
// create replica set
#!/usr/bin/env bash
mkdir -p /data/rs1 /data/rs2 /data/rs3
mongod --replSet m101 --logpath "1.log" --dbpath /data/rs1 --port 27017 --oplogSize 64 --fork --smallfiles
mongod --replSet m101 --logpath "2.log" --dbpath /data/rs2 --port 27018 --oplogSize 64 --smallfiles --fork
mongod --replSet m101 --logpath "3.log" --dbpath /data/rs3 --port 27019 --oplogSize 64 --smallfiles --fork
// init replica set
config = { _id: "m101", members:[
{ _id : 0, host : "localhost:27017"},
{ _id : 1, host : "localhost:27018"},
{ _id : 2, host : "localhost:27019"} ]
};
rs.initiate(config);
rs.status()
rs.config() // show rs config
rs.slaveOk() // allow reads from secondary node
rs.stepDown() // ask the primary to step down and trigger an election
rs.help()
each rs node has oplog (secondary is constantly reading oplog of primary)
use local
show collections
db.oplog.rs.find()
a recovering or newly added node connects to the primary and, if it has fallen too far behind the oplog, copies the entire dataset (initial sync)
proper handling of failover:
insert: catch AutoReconnect errors; if AutoReconnect occurs, sleep X seconds and retry the operation
for retry in range(3):
    try:
        things.insert_one({'_id': i})
        print "Inserted Document: " + str(i)
        time.sleep(.1)
        break
    except pymongo.errors.AutoReconnect as e:
        print "Exception", type(e), e
        print "Retrying.."
        time.sleep(5)
    except pymongo.errors.DuplicateKeyError as e:
        print "duplicate..but it worked"
        break
read: retry
update:
    - idempotent ($set): retry
    - non-idempotent ($inc, $push): depends on the application - the update may be applied twice (see sketch below)
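// sketch of why idempotency matters on retry (hypothetical 'views' field):
db.data.updateOne({"title": "Martian"}, {$set: {"views": 43}})  // safe to retry: same end state
db.data.updateOne({"title": "Martian"}, {$inc: {"views": 1}})   // retried after a lost ack, it may count twice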
read preference:
    - Primary
    - PrimaryPreferred
    - Secondary (only secondaries)
    - SecondaryPreferred
    - Nearest (closest ping time)
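// shell-side sketch of setting a read preference (modes as listed above):
db.getMongo().setReadPref('secondaryPreferred')  // route this connection's reads to a secondary when available
db.students.find().readPref('nearest')           // per-query override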
sharding
mongos - router in front of the sharded hosts
shard key:
    - every doc must include the shard key
    - the shard key is immutable
    - the shard key must be indexed (an index, or a non-multikey index, starting with the shard key)
    - updates must specify the shard key or set 'multi'
    - queries that do not include the shard key are scatter-gathered to all shards
    - unique indexes are not allowed (unless the shard key is part of them)
types of sharding:
    - range-based (s1: 1..100, s2: 101..200, ...)
    - hash-based (hashing over the shard key)
sh.status()
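// a minimal sketch of sharding a collection (assumes db 'school' and shard key student_id):
sh.enableSharding("school")
db.students.createIndex({student_id: 1})  // the shard key must be indexed first
sh.shardCollection("school.students", {student_id: 1})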
# final exam
## Q1
```javascript
db.messages.find({'headers.From': 'andrew.fastow@enron.com', 'headers.To': 'jeff.skilling@enron.com'}, {'headers.From':1, 'headers.To':1})
```
## Q2
```javascript
// wrong
//db.messages.aggregate( {$project: {'_id': 0, 'From':'$headers.From', 'To': '$headers.To'}}, {$unwind: '$To'}, {'$group': {'_id': {'From':'$From', 'To': '$To'}, 'sum': {$sum:1} }}, {$sort: {"sum": -1}} )
// correct
db.messages.aggregate( {$project: {'From':'$headers.From', 'To': '$headers.To'}}, {$unwind: '$To'}, {$group: {_id: {'Msg': '$_id', 'From': '$From'}, 'To': {$addToSet: '$To'}}}, {$unwind: '$To'}, {'$group': {'_id': {'From':'$_id.From', 'To': '$To'}, 'sum': {$sum:1} }}, {$sort: {"sum": -1}} )
```
## Q3
```javascript
// get 'To' count
db.messages.aggregate( {$match: {'headers.Message-ID': '<8147308.1075851042335.JavaMail.evans@thyme>'}}, {$project: {'From': '$headers.From', 'To': '$headers.To'}}, {$unwind: '$To'}, {$group: {'_id':null, 'sum': {$sum: 1}}} )
// update
db.messages.update({'headers.Message-ID': '<8147308.1075851042335.JavaMail.evans@thyme>'}, {$push: {'headers.To': 'mrpotatohead@mongodb.com'}})
```
## Q4
```python
try:
    self.posts.update_one({'permalink': permalink},
                          {'$inc': {"comments.{ord}.num_likes".format(ord=comment_ordinal): 1}})
except Exception as err:
    print('Error INC likes. Error: {err}'.format(err=err))
```