Created
April 10, 2017 13:46
-
-
Save stephkoltun/d3312d609c44f1e4008a150881c19230 to your computer and use it in GitHub Desktop.
Atomizing text prior to uploading it to a database as various objects
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * First pipeline stage: builds the upload object for a freshly uploaded
 * file and enriches it with the matching record from the module-level
 * `docMetadata` array ({FileName, DateCreated, FilePath} objects).
 *
 * err      - unused (pipeline signature convention).
 * data     - multer-style file object ({originalname, path}) — TODO confirm.
 * callback - invoked as callback(null, masterObject, tokenizeInput).
 */
var processUpload = function(err, data, callback) {
  var file = data;
  var masterObject = {};
  var uploadObj = {
    _id: file.originalname.substring(0, 4), // first 4 chars = file id
    servPath: file.path,
    fileName: file.originalname,
    inputString: fs.readFileSync(file.path, "utf8"),
  };
  // Locate the metadata record for this file (replaces the original
  // linear loop with a last-index "no match" check).
  var matchFile = docMetadata.find(function(meta) {
    return meta.FileName == file.originalname;
  });
  if (matchFile) {
    console.log("matched file!");
    // The Date constructor parses the string itself; wrapping Date.parse
    // was redundant. NOTE(review): non-ISO DateCreated strings are
    // implementation-defined — confirm the metadata format is ISO 8601.
    uploadObj.timestamp = new Date(matchFile.DateCreated);
    uploadObj.name = matchFile.FileName.substring(5).replace(".md", "");
    // Normalize " - " and spaces to "_", then split the path on ":".
    uploadObj.path = matchFile.FilePath.replace(/( - )|( )/g, "_").split(":");
    // Only attach the upload object when metadata was found (matches
    // the original behavior).
    masterObject.uploadObj = uploadObj;
  } else {
    console.log("did not find match data for " + uploadObj.fileName);
  }
  // this routes to tokenizeInput
  callback(null, masterObject, tokenizeInput);
}
/**
 * Normalizes the raw upload text so the sentence tokenizer sees "."
 * boundaries: converts line-ending semicolons/colons to periods, strips
 * trailing spaces and tabs, collapses blank-line runs, then expands
 * contractions via the module-level `contractions` helper.
 *
 * callback - invoked as callback(null, masterObject, createDocObject).
 */
var stripString = function(err, data, callback) {
  var masterObject = data;
  var uploadObj = masterObject.uploadObj;
  console.log("cleaning string");
  var str = uploadObj.inputString.replace(/;(?=(\n)|( \n))/g, "."); // semicolon before newline
  str = str.replace(/;$/g, "."); // semicolon at very end of the string
  str = str.replace(/:(?=(\n))/g, "."); // colon then newline
  str = str.replace(/’(?=(\n))/g, "’."); // right quote then newline
  str = str.replace(/ (?=(\n))/g, ""); // trailing space before newline
  str = str.replace(/_(?=(\n)|( \n))/g, "_.\n"); // underscore then newline
  str = str.replace(/\b\n/g, ".\n"); // word char then newline -> add period
  // BUG FIX: collapse ANY run of 2+ newlines to one. The original ran
  // /\n\n/g first, so the later /\n\n\n/ and /\n\n\n\n/ passes could
  // never match and runs of 4+ newlines left "\n\n" residue.
  str = str.replace(/\n{2,}/g, "\n");
  str = str.replace(/\t/g, "");
  str = contractions.expand(str);
  masterObject.cleanString = str;
  // this routes to createDocObject
  callback(null, masterObject, createDocObject);
};
/**
 * Tokenizes the cleaned string three ways: all words, all sentences,
 * and words grouped per sentence. Relies on the module-level `natural`
 * library and `sentTokenizerOnly` constructor.
 *
 * callback - invoked as callback(null, masterObject, createSentObject).
 */
var tokenizeInput = function(err, data, callback) {
  var masterObject = data;
  console.log("start tokenizing");
  var cleanString = masterObject.cleanString;
  // initialize tokenizers
  var wordTokenizer = new natural.WordTokenizer();
  var sentenceTokenizer = new sentTokenizerOnly();
  var tokens = {};
  tokens.wordsTokens = wordTokenizer.tokenize(cleanString);
  sentenceTokenizer.setEntry(cleanString);
  tokens.sentTokens = sentenceTokenizer.getSentences();
  tokens.wordsPerSentTokens = [];
  // BUG FIX: "i" was missing "var" and leaked as an implicit global.
  for (var i = 0; i < tokens.sentTokens.length; i++) {
    tokens.wordsPerSentTokens[i] = wordTokenizer.tokenize(tokens.sentTokens[i]);
  }
  masterObject.tokens = tokens;
  // this routes to createSentObject
  callback(null, masterObject, createSentObject);
}
var createDocObject = function(err, data, callback) { | |
console.log("create doc object"); | |
var masterObject = data; | |
var uploadObj = masterObject.uploadObj; | |
var docObj = {}; | |
// inherited properties | |
docObj.timestamp = uploadObj.timestamp; | |
docObj.path = uploadObj.path; | |
docObj.name = uploadObj.name; | |
// unique properties | |
docObj.dbPath = "documents"; | |
docObj._id = "d" + uploadObj._id; // add prefix | |
docObj.docString = masterObject.cleanString; | |
docObj.docSentIDArray = []; | |
docObj.docWordIDArray = []; | |
// geneology | |
docObj.parentID = []; // complete after | |
docObj.childID = []; // complete after | |
// assign docObj to master for passing to callback | |
masterObject.docObj = docObj; | |
// this routes to createSentObject | |
callback(null, masterObject, addAdjacentSents); | |
} | |
/**
 * Builds one sentence object per sentence token, flags markdown-style
 * leading characters, strips those markers into a per-sentence clean
 * string, and registers each sentence id on the parent document.
 *
 * callback - invoked as callback(null, masterObject, createWordObjs).
 */
var createSentObject = function(err, data, callback) {
  console.log("create sentence objects");
  var masterObject = data;
  var uploadObj = masterObject.uploadObj;
  var tokens = masterObject.tokens;
  var docObj = masterObject.docObj;
  var sentences = tokens.sentTokens;
  var sentObjs = [];
  // One tokenizer for the whole loop instead of a new instance per
  // sentence (natural's WordTokenizer holds no per-call state).
  var wordTokenizer = new natural.WordTokenizer();
  for (var i = 0; i < sentences.length; i++) {
    var thisSentence = {};
    // inherited properties
    thisSentence.timestamp = uploadObj.timestamp;
    thisSentence.path = uploadObj.path;
    thisSentence.name = uploadObj.name;
    thisSentence.docID = docObj._id;
    // unique properties: "s" prefix + upload id + sentence index
    thisSentence.dbPath = "sentences";
    thisSentence._id = "s" + uploadObj._id + i;
    // register this sentence on the parent document object
    docObj.docSentIDArray[i] = thisSentence._id;
    // content
    thisSentence.sentString = sentences[i];
    thisSentence.sentWordIDArray = []; // filled in by createWordObjs
    // leading special (markdown-ish) characters -> boolean flags
    thisSentence.specChar = /^[#\?`'>\*~\|%:\\\[\+_-]/.test(thisSentence.sentString);
    thisSentence.pound = /^#/.test(thisSentence.sentString);
    thisSentence.quest = /^\?/.test(thisSentence.sentString);
    thisSentence.apos = /^`/.test(thisSentence.sentString);
    thisSentence.quo = /^'/.test(thisSentence.sentString);
    thisSentence.arrow = /^>/.test(thisSentence.sentString);
    thisSentence.star = /^\*/.test(thisSentence.sentString);
    thisSentence.squig = /^~/.test(thisSentence.sentString);
    thisSentence.vert = /^\|/.test(thisSentence.sentString);
    thisSentence.percent = /^%/.test(thisSentence.sentString);
    thisSentence.colon = /^:/.test(thisSentence.sentString);
    thisSentence.slash = /^\\/.test(thisSentence.sentString);
    thisSentence.square = /^\[/.test(thisSentence.sentString);
    thisSentence.plus = /^\+/.test(thisSentence.sentString);
    thisSentence.under = /^_/.test(thisSentence.sentString);
    thisSentence.dash = /^-/.test(thisSentence.sentString);
    // strip the leading markers to form the clean string
    thisSentence.cleanString = thisSentence.sentString.replace(/^# /g, "");
    thisSentence.cleanString = thisSentence.cleanString.replace(/(^\?\?)|(^\? )|(^\?)/g, "");
    thisSentence.cleanString = thisSentence.cleanString.replace(/(^``)|(^` )|(^`)/g, "");
    thisSentence.cleanString = thisSentence.cleanString.replace(/(^'')|(^' )|(^')/g, "");
    thisSentence.cleanString = thisSentence.cleanString.replace(/(^> )|(^>)/g, "");
    thisSentence.cleanString = thisSentence.cleanString.replace(/(^\*\*)/g, "");
    thisSentence.cleanString = thisSentence.cleanString.replace(/(^\* )|(^\*)/g, "");
    thisSentence.cleanString = thisSentence.cleanString.replace(/(^~~)/g, "");
    // BUG FIX: dropped a stray trailing "|" that made this regex also
    // match the empty string (an accidental, harmless no-op branch).
    thisSentence.cleanString = thisSentence.cleanString.replace(/(^~ )|(^~)/g, "");
    thisSentence.cleanString = thisSentence.cleanString.replace(/(^\|\| )/g, "");
    thisSentence.cleanString = thisSentence.cleanString.replace(/(^\| )|(^\|)/g, "");
    thisSentence.cleanString = thisSentence.cleanString.replace(/(^%%)|(^% )|(^%)/g, "");
    thisSentence.cleanString = thisSentence.cleanString.replace(/(^::)|(^: )|(^:)/g, "");
    thisSentence.cleanString = thisSentence.cleanString.replace(/(^\\\\)|(^\\ )|(^\\)/g, "");
    // BUG FIX: "^\[\[" was listed LAST so "^\[" always won and "[["
    // only lost one bracket; the double-bracket branch now comes first.
    thisSentence.cleanString = thisSentence.cleanString.replace(/(^\[\[)|(^\[ )|(^\[)/g, "");
    thisSentence.cleanString = thisSentence.cleanString.replace(/(^\+\+)|(^\+ )|(^\+)/g, "");
    // NOTE: removes ALL underscores, not just leading ones (original behavior).
    thisSentence.cleanString = thisSentence.cleanString.replace(/_/g, "");
    thisSentence.cleanString = thisSentence.cleanString.replace(/(^- )|(^-)/g, "");
    // tokenize the cleaned sentence into words
    thisSentence.sentWordArray = wordTokenizer.tokenize(thisSentence.cleanString);
    // context — linked up by addAdjacentSents
    thisSentence.leftID = null;
    thisSentence.rightID = null;
    // geneology — completed by later stages
    thisSentence.parentID = [];
    thisSentence.childID = [];
    // sequence and position (sentTotal is the LAST INDEX, not the count)
    thisSentence.sentSequence = i;
    thisSentence.sentTotal = sentences.length - 1;
    thisSentence.first = (i === 0);
    thisSentence.last = (i === sentences.length - 1);
    // add sentence to array
    sentObjs[i] = thisSentence;
  }
  // assign sentence objects to master data object
  masterObject.sentObjs = sentObjs;
  // this routes to createWordObjs
  callback(null, masterObject, createWordObjs);
}
/**
 * Links each sentence object to its neighbors by id (leftID/rightID;
 * null at either end).
 *
 * BUG FIX: for the last sentence the original set
 * leftID = sentObjs[sentObjs.length - 1]._id — i.e. the sentence's OWN
 * id (since i == length - 1) — instead of the previous sentence's id.
 * Also fixes the implicit global "thisSent" and no longer throws on an
 * empty sentence array.
 *
 * callback - invoked as callback(null, masterObject, addAdjacentWords).
 */
var addAdjacentSents = function(err, data, callback) {
  console.log("add adjacent sentence IDs");
  var masterObject = data;
  var sentObjs = masterObject.sentObjs;
  for (var i = 0; i < sentObjs.length; i++) {
    var thisSent = sentObjs[i];
    thisSent.leftID = (i > 0) ? sentObjs[i - 1]._id : null;
    thisSent.rightID = (i < sentObjs.length - 1) ? sentObjs[i + 1]._id : null;
  }
  // routes to createWordObjs
  callback(null, masterObject, addAdjacentWords);
}
/**
 * Builds one word object per token in every sentence, records each word
 * id back on its sentence, and tracks each word's position within both
 * its sentence and the whole document.
 *
 * callback - invoked as callback(null, masterObject, addWordIDtoDoc).
 */
var createWordObjs = function(err, data, callback) {
  console.log("create word objs for each sentence");
  var masterObject = data;
  var tokens = masterObject.tokens;
  var uploadObj = masterObject.uploadObj;
  var docObj = masterObject.docObj;
  var sentObjs = masterObject.sentObjs;
  var allWords = [];
  // Running total of words in all previous sentences. The original
  // re-summed every earlier sentence for each sentence (O(n^2)); a
  // single accumulator produces the same numbers in O(n).
  var prevSentLength = 0;
  for (var i = 0; i < sentObjs.length; i++) {
    var thisSent = sentObjs[i];
    var wordArray = thisSent.sentWordArray;
    for (var k = 0; k < wordArray.length; k++) {
      var thisWord = {};
      // inherited
      thisWord.timestamp = uploadObj.timestamp;
      thisWord.path = uploadObj.path;
      thisWord.name = uploadObj.name;
      thisWord.docID = docObj._id;
      thisWord.sentID = thisSent._id;
      thisWord.sentSequence = thisSent.sentSequence;
      thisWord.sentTotal = thisSent.sentTotal;
      // unique: "w" + sentence id (minus its "s" prefix) + word index
      thisWord.dbPath = "words";
      thisWord._id = "w" + thisSent._id.substring(1) + k;
      // register the word id back on its sentence
      thisSent.sentWordIDArray[k] = thisWord._id;
      // content
      thisWord.string = wordArray[k];
      // context — linked up by addAdjacentWords
      thisWord.leftID = null;
      thisWord.rightID = null;
      // geneology — completed later
      thisWord.parentID = [];
      thisWord.childID = [];
      // sequence and position in sentence (totals are the LAST INDEX)
      thisWord.wordSentSequence = k;
      thisWord.wordSentTotal = wordArray.length - 1;
      thisWord.first = (k === 0);
      thisWord.last = (k === wordArray.length - 1);
      // sequence in the whole document
      thisWord.wordDocSequence = k + prevSentLength;
      thisWord.wordDocTotal = tokens.wordsTokens.length - 1;
      allWords.push(thisWord);
    }
    prevSentLength += wordArray.length;
  }
  masterObject.wordObjs = allWords;
  // routes to addWordIDtoDoc
  callback(null, masterObject, addWordIDtoDoc);
}
/**
 * Links each word object to its neighbors WITHIN its sentence: the
 * first word of a sentence gets leftID null, the last gets rightID
 * null, and a single-word sentence gets both null.
 *
 * BUG FIX: "thisWord" was assigned without var and leaked as an
 * implicit global.
 *
 * callback - invoked as callback(null, masterObject, addWordIDtoDoc).
 */
var addAdjacentWords = function(err, data, callback) {
  console.log("add adjacent words");
  var masterObject = data;
  var wordObjs = masterObject.wordObjs;
  for (var i = 0; i < wordObjs.length; i++) {
    var thisWord = wordObjs[i];
    // wordSentTotal is the last index; > 0 means the sentence has at
    // least two words
    if (thisWord.wordSentTotal > 0) {
      thisWord.leftID = thisWord.first ? null : wordObjs[i - 1]._id;
      thisWord.rightID = thisWord.last ? null : wordObjs[i + 1]._id;
    } else {
      // only word in its sentence
      thisWord.leftID = null;
      thisWord.rightID = null;
    }
  }
  // routes to addWordIDtoDoc
  callback(null, masterObject, addObjectToDatabase);
}
/**
 * Copies every word object's id into the parent document's
 * docWordIDArray, preserving document order.
 *
 * callback - invoked as callback(null, masterObject, confirmSave).
 */
var addWordIDtoDoc = function(err, data, callback) {
  console.log("add word IDs to doc object");
  var masterObject = data;
  var docObj = masterObject.docObj;
  // mirror each word id into the document's id array
  masterObject.wordObjs.forEach(function(word, idx) {
    docObj.docWordIDArray[idx] = word._id;
  });
  // routes to addObjectToDatabase
  callback(null, masterObject, confirmSave);
}
/**
 * Persists the upload, document, sentence, and word objects to their
 * respective module-level collections.
 *
 * NOTE(review): saves are fire-and-forget — the callback is invoked
 * immediately, before any save has confirmed, and failures are only
 * logged. Confirm this at-most-logged error handling is intentional.
 * (Removed the unused `tokens` local and a stray ";" after a callback.)
 *
 * callback - invoked as callback(null, masterObject).
 */
var addObjectToDatabase = function(err, data, callback) {
  console.log("add objects to database");
  var masterObject = data;
  var uploadObj = masterObject.uploadObj;
  var docObj = masterObject.docObj;
  var sentObjs = masterObject.sentObjs;
  var wordObjs = masterObject.wordObjs;
  // add upload object
  uploadCollection.save(uploadObj, function(err, saved) {
    if (err || !saved) console.log("Not saved");
    else console.log("Saved " + uploadObj._id);
  });
  // add document
  docCollection.save(docObj, function(err, saved) {
    if (err || !saved) console.log("Not saved");
    else console.log("Saved " + docObj._id);
  });
  // add sentences
  for (var i = 0; i < sentObjs.length; i++) {
    sentCollection.save(sentObjs[i], function(err, saved) {
      if (err || !saved) console.log("Sentence not saved");
    });
  }
  // add words
  for (var k = 0; k < wordObjs.length; k++) {
    wordCollection.save(wordObjs[k], function(err, saved) {
      if (err || !saved) console.log("Word not saved");
    });
  }
  // routes to confirmSave
  callback(null, masterObject);
}
// Final pipeline stage: publishes the fully-processed master object via
// the `processedContent` variable.
// err  - unused (pipeline signature convention).
// data - the master object accumulated by the previous stages.
// NOTE(review): `processedContent` is assigned without var — assumed to
// be declared at module level elsewhere in this file (TODO confirm); as
// written this is an implicit global in non-strict mode.
var confirmSave = function(err, data) {
  console.log("everything finished saving to database");
  // send back processed information as json response to user
  processedContent = data;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment