Skip to content

Instantly share code, notes, and snippets.

@Fliktrax
Created December 13, 2017 21:03
Show Gist options
  • Save Fliktrax/73984e3c3b71da3241be83b4f214c9b8 to your computer and use it in GitHub Desktop.
Save Fliktrax/73984e3c3b71da3241be83b4f214c9b8 to your computer and use it in GitHub Desktop.
Search and/or Remove Duplicate Addresses From a Mongo Collection
/*
 * Search for and remove duplicate addresses in the `household_entries`
 * collection (legacy mongo shell script).
 *
 * For every document, a $text phrase query is built from its full_name,
 * address_line_1 and post_code_5 fields. All documents whose textScore for
 * that query exceeds 2.0 are treated as one duplicate group: the first match
 * returned by the pipeline is kept and the rest are removed.
 *
 * The $text stage requires a text index on the collection.
 */

// _ids of every document removed by the run, kept for inspection afterwards.
var duplicates = [];

/**
 * Build the $text search string for one household document.
 *
 * Each whitespace-separated token of full_name, address_line_1 and
 * post_code_5 is wrapped in double quotes so $text treats it as a phrase.
 * Missing/null fields are skipped (instead of contributing the literal words
 * "undefined"/"null"), runs of whitespace never yield empty "" phrases, and
 * double quotes inside the data are treated as separators so they cannot
 * break the phrase quoting.
 *
 * @param {Object} hhDoc - household document read from the collection
 * @returns {string} quoted phrase list, or '' when there is nothing to search for
 */
function buildPhraseQuery(hhDoc) {
    var fields = [hhDoc.full_name, hhDoc.address_line_1, hhDoc.post_code_5];
    var tokens = [];
    fields.forEach(function (value) {
        if (value === undefined || value === null) {
            return;
        }
        String(value).replace(/"/g, ' ').split(/\s+/).forEach(function (token) {
            if (token.length > 0) {
                tokens.push(token);
            }
        });
    });
    if (tokens.length === 0) {
        return '';
    }
    return '"' + tokens.join('" "') + '"';
}

/**
 * Walk the whole collection and delete the duplicates of each document.
 *
 * @param {Object} collection - mongo shell collection (needs find, aggregate, remove)
 * @param {Array} removedIds - receives the _id of every removed document
 */
function removeDuplicateHouseholds(collection, removedIds) {
    // noTimeout: the scan issues one aggregate per document and can run for a long time.
    var cursor = collection.find().addOption(DBQuery.Option.noTimeout);
    cursor.forEach(function (hhDoc) {
        var queryString = buildPhraseQuery(hhDoc);
        if (queryString === '') {
            // No usable name/address/post code: nothing to compare against.
            return;
        }
        var pipeline = [
            { "$match": { "$text": { "$search": queryString } } },
            // Only _id is consumed below, so only _id and the score are projected.
            { "$project": { _id: 1, "score": { "$meta": "textScore" } } },
            { "$match": { "score": { "$gt": 2.0 } } }
        ];
        var matchIds = [];
        collection.aggregate(pipeline).forEach(function (dupDoc) {
            matchIds.push(dupDoc._id);
        });
        if (matchIds.length > 1) {
            // Keep the first match, remove every other member of the group.
            matchIds.shift();
            collection.remove({ _id: { $in: matchIds } });
            matchIds.forEach(function (id) {
                removedIds.push(id);
            });
        }
    });
}

// `db` only exists inside the mongo shell; the guard lets the helpers above
// be loaded elsewhere (e.g. for testing) without touching a database.
if (typeof db !== 'undefined') {
    removeDuplicateHouseholds(db.household_entries, duplicates);
}
@Fliktrax
Copy link
Author

Fliktrax commented Dec 13, 2017

Assumes a wildcard text index already exists on the collection:

https://docs.mongodb.com/v3.4/core/index-text/#create-text-index

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment