Skip to content

Instantly share code, notes, and snippets.

@j-coll
Last active March 9, 2017 11:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save j-coll/70896e9efb17bda299800d423741fce3 to your computer and use it in GitHub Desktop.
Save j-coll/70896e9efb17bda299800d423741fce3 to your computer and use it in GitHub Desktop.
/**
 * Walks every document of `collection` matching `query` (restricted to
 * `projection`) and lets `migrateFunc` queue write operations on an ordered
 * bulk operation. The bulk is executed, and replaced by a fresh one, each time
 * the number of queued operations reaches 500, and once more at the end if
 * anything is still pending.
 *
 * @param {string} collection - Name passed to `db.getCollection`.
 * @param {Object} query - Filter passed to `find`.
 * @param {Object} projection - Projection passed to `find`.
 * @param {function(Object, Object)} migrateFunc - Called as
 *     `migrateFunc(bulk, doc)` for every document; it may queue operations on
 *     `bulk`.
 */
function migrateCollection(collection, query, projection, migrateFunc) {
    var bulk = db.getCollection(collection).initializeOrderedBulkOp();
    var count = 0;
    var bulkSize = 500;

    // Number of operations currently queued on the active bulk.
    function pendingOps() {
        return bulk.nUpdateOps + bulk.nInsertOps + bulk.nRemoveOps;
    }

    // Adds the queued operations to the running total, logs it, runs the bulk
    // and starts a new empty one.
    function flush() {
        count += pendingOps();
        print("Execute bulk! " + count);
        bulk.execute();
        bulk = db.getCollection(collection).initializeOrderedBulkOp();
    }

    db.getCollection(collection).find(query, projection).forEach(function (doc) {
        migrateFunc(bulk, doc);
        if (pendingOps() >= bulkSize) {
            flush();
        }
    });

    // Whatever did not fill a complete batch still has to be written.
    if (pendingOps() > 0) {
        flush();
    }

    if (count == 0) {
        print("Nothing to do!");
    }
}
/**
 * Reduces a single reference/alternate pair to its minimal form: first the
 * trailing characters shared by both alleles are removed, then the leading
 * ones, moving the start position forward by the number of leading characters
 * dropped.
 *
 * @param {number} start - Position of the first character of `ref`.
 * @param {string} ref - Reference allele.
 * @param {string} alt - Alternate allele.
 * @returns {string} The trimmed variant formatted as "start:ref:alt".
 */
function normalize_single(start, ref, alt) {
    var position = start;
    var trimmedRef = ref;
    var trimmedAlt = alt;

    // Shared suffix goes first; the position is unaffected by it.
    var suffixLength = reverseIndexOfDifference(trimmedRef, trimmedAlt);
    if (suffixLength > 0) {
        trimmedRef = trimmedRef.substring(0, trimmedRef.length - suffixLength);
        trimmedAlt = trimmedAlt.substring(0, trimmedAlt.length - suffixLength);
    }

    // Shared prefix is removed afterwards and shifts the position.
    var prefixLength = indexOfDifference(trimmedRef, trimmedAlt);
    if (prefixLength > 0) {
        position += prefixLength;
        trimmedRef = trimmedRef.substring(prefixLength);
        trimmedAlt = trimmedAlt.substring(prefixLength);
    }

    return position + ":" + trimmedRef + ":" + trimmedAlt;
}
/**
 * Expands an original call string of the form "start:ref:alt1,alt2,..." into
 * one normalized "start:ref:alt" string per alternate allele.
 *
 * Only the first three colon-separated fields are read; any further fields
 * are ignored. A string with fewer than three fields makes the alternates
 * field undefined and therefore throws a TypeError.
 *
 * @param {string} ori - Original call string.
 * @returns {string[]} Normalized variants, in the order the alternates appear.
 */
function normalize(ori) {
    var fields = ori.split(":");
    // Explicit radix: the start position is always decimal and must never be
    // interpreted by prefix (e.g. "0x..." as hexadecimal).
    var start = parseInt(fields[0], 10);
    var ref = fields[1];
    var alts = fields[2];
    return alts.split(",").map(function (alt) {
        return normalize_single(start, ref, alt);
    });
}
/**
 * Finds the first position at which two strings differ, scanning from the
 * start.
 *
 * @param {?string} cs1 - First string.
 * @param {?string} cs2 - Second string.
 * @returns {number} -1 when the two values compare equal, 0 when exactly one
 *     of them is null/undefined, otherwise the length of their common prefix.
 */
function indexOfDifference(cs1, cs2) {
    if (cs1 == cs2) {
        return -1;
    }
    if (cs1 == null || cs2 == null) {
        return 0;
    }

    var pos = 0;
    while (pos < cs1.length && pos < cs2.length && cs1[pos] == cs2[pos]) {
        pos++;
    }

    // Both inputs exhausted means no difference was found.
    return (pos < cs2.length || pos < cs1.length) ? pos : -1;
}
/**
 * Counts how many trailing characters two strings have in common, scanning
 * from the end.
 *
 * @param {?string} cs1 - First string.
 * @param {?string} cs2 - Second string.
 * @returns {number} -1 when the two values compare equal, 0 when exactly one
 *     of them is null/undefined, otherwise the length of their common suffix.
 */
function reverseIndexOfDifference(cs1, cs2) {
    if (cs1 == cs2) {
        return -1;
    }
    if (cs1 == null || cs2 == null) {
        return 0;
    }

    var len1 = cs1.length;
    var len2 = cs2.length;
    var matched = 0;
    while (matched < len1 && matched < len2
            && cs1[len1 - matched - 1] == cs2[len2 - matched - 1]) {
        matched++;
    }

    // Both inputs exhausted means no difference was found.
    return (matched < len2 || matched < len1) ? matched : -1;
}
// Progress counters, updated by the callback below and reported in its
// periodic log line.
var scannedVariants = 0;
var totalVariants = db.variants.count();
var modifiedVariants = 0;
var modifiedFiles = 0;

// For every variant, each file entry that carries an "_ori" call is checked:
// the original call is normalized and, if none of the normalized forms equals
// the variant's own "start:reference:alternate" key, a positive file id is
// replaced by its negative.
migrateCollection(
    "variants",
    {},
    {
        chromosome: 1,
        start: 1,
        reference: 1,
        alternate: 1,
        "studies.sid": 1,
        "studies.files.fid": 1,
        "studies.files._ori": 1
    },
    function (bulk, v) {
        var variantKey = v.start + ":" + v.reference + ":" + v.alternate;
        var fieldsToSet = {};
        var needsUpdate = false;

        if (v.hasOwnProperty("studies")) {
            for (var studyKey in v.studies) {
                var currentStudy = v.studies[studyKey];
                for (var fileKey in currentStudy.files) {
                    var entry = currentStudy.files[fileKey];
                    if (!entry.hasOwnProperty("_ori")) {
                        continue;
                    }
                    var normalizedCalls = normalize(entry._ori.s);
                    if (normalizedCalls.indexOf(variantKey) != -1) {
                        // The original call does produce this variant.
                        continue;
                    }
                    var fileId = entry.fid;
                    if (fileId > 0) {
                        // Only positive ids are flipped, so already negated
                        // entries are left alone.
                        needsUpdate = true;
                        modifiedFiles++;
                        fieldsToSet["studies." + studyKey + ".files." + fileKey + ".fid"] = NumberInt(-fileId);
                    }
                }
            }
        }

        scannedVariants++;
        if (needsUpdate) {
            modifiedVariants++;
            bulk.find({_id: v._id}).updateOne({$set: fieldsToSet});
        }
        if (scannedVariants % 10000 == 0) {
            print("Processed " + scannedVariants + "/" + totalVariants + " " + (scannedVariants/totalVariants*100).toPrecision(3) + "% variants. Updated " + modifiedFiles + " files from " + modifiedVariants + " variants");
        }
    }
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment