Skip to content

Instantly share code, notes, and snippets.

@stephkoltun
Created April 10, 2017 13:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stephkoltun/d3312d609c44f1e4008a150881c19230 to your computer and use it in GitHub Desktop.
Save stephkoltun/d3312d609c44f1e4008a150881c19230 to your computer and use it in GitHub Desktop.
Atomizing text prior to uploading it to a database as various objects
/**
 * Pipeline entry point: builds the upload record for a received file and
 * fills in its metadata from the matching `docMetadata` entry.
 *
 * @param {?Error} err - Upstream error; when set, no file is read and the
 *     error is forwarded unchanged.
 * @param {Object} data - File descriptor; `originalname` and `path` are read.
 * @param {Function} callback - Invoked as
 *     `callback(err, masterObject, tokenizeInput)`. On success
 *     `masterObject.uploadObj` holds the record; on failure it is absent and
 *     `err` is an Error.
 */
var processUpload = function(err, data, callback) {
    var masterObject = {};
    if (err) {
        // Nothing usable to read; let the following stages see the error.
        return callback(err, masterObject, tokenizeInput);
    }
    var file = data;
    var uploadObj = {
        // the first four characters of the file name double as the record id
        _id: file.originalname.substring(0, 4),
        servPath: file.path,
        fileName: file.originalname,
        inputString: fs.readFileSync(file.path, "utf8")
    };
    // find the metadata row describing this file
    var matchFile = null;
    for (var i = 0; i < docMetadata.length; i++) {
        if (docMetadata[i].FileName === file.originalname) {
            matchFile = docMetadata[i];
            break;
        }
    }
    if (!matchFile) {
        // Without metadata there is no upload record to hand on; report it
        // through the error argument instead of passing an empty object on
        // as if it were a success.
        var notFoundMessage = "did not find match data for " + uploadObj.fileName;
        console.log(notFoundMessage);
        return callback(new Error(notFoundMessage), masterObject, tokenizeInput);
    }
    console.log("matched file!");
    uploadObj.timestamp = new Date(Date.parse(matchFile.DateCreated));
    // drop the five-character prefix and the markdown extension
    uploadObj.name = matchFile.FileName.substring(5).replace(".md", "");
    // " - " and single spaces become underscores, then ":" separates path parts
    uploadObj.path = matchFile.FilePath.replace(/( - )|( )/g, "_").split(":");
    masterObject.uploadObj = uploadObj;
    // hand off to the next stage; tokenizeInput is passed along as the stage after it
    callback(null, masterObject, tokenizeInput);
};
/**
 * Normalises the raw upload text so every line ends like a sentence, then
 * stores the result on `masterObject.cleanString`.
 *
 * @param {?Error} err - Upstream error; when set, the text is not processed
 *     and the error is forwarded.
 * @param {Object} data - Master object; `uploadObj.inputString` is read.
 * @param {Function} callback - Invoked as
 *     `callback(err, masterObject, createDocObject)`.
 */
var stripString = function(err, data, callback) {
    if (err) {
        return callback(err, data, createDocObject);
    }
    var masterObject = data;
    var uploadObj = masterObject.uploadObj;
    console.log("cleaning string");
    var str = uploadObj.inputString.replace(/;(?=(\n)|( \n))/g, "."); // semi then new line
    str = str.replace(/;$/g, "."); // semi at the very end of the text
    str = str.replace(/:(?=(\n))/g, "."); // colon then new line
    str = str.replace(/’(?=(\n))/g, "’."); // right quote then new line
    str = str.replace(/ (?=(\n))/g, ""); // space directly before a new line
    str = str.replace(/_(?=(\n)|( \n))/g, "_.\n"); // underscore then new line
    str = str.replace(/\b\n/g, ".\n"); // word boundary then new line
    // Collapse every run of new lines in a single pass. Replacing pairs first
    // and longer runs afterwards left blank lines behind (three in a row
    // became two, which the later patterns could no longer match).
    str = str.replace(/\n{2,}/g, "\n");
    str = str.replace(/\t/g, "");
    str = contractions.expand(str);
    masterObject.cleanString = str;
    // this routes to tokenizeInput
    callback(null, masterObject, createDocObject);
};
/**
 * Splits the cleaned text into word tokens, sentence tokens and per-sentence
 * word tokens, and stores them on `masterObject.tokens`.
 *
 * @param {?Error} err - Upstream error; when set, nothing is tokenized and
 *     the error is forwarded.
 * @param {Object} data - Master object; `cleanString` is read.
 * @param {Function} callback - Invoked as
 *     `callback(err, masterObject, createSentObject)`.
 */
var tokenizeInput = function(err, data, callback) {
    if (err) {
        return callback(err, data, createSentObject);
    }
    var masterObject = data;
    console.log("start tokenizing");
    var cleanString = masterObject.cleanString;
    // initialize tokenizers
    var wordTokenizer = new natural.WordTokenizer();
    var sentenceTokenizer = new sentTokenizerOnly();
    var tokens = {};
    // every word of the document
    tokens.wordsTokens = wordTokenizer.tokenize(cleanString);
    // every sentence of the document
    sentenceTokenizer.setEntry(cleanString);
    tokens.sentTokens = sentenceTokenizer.getSentences();
    // the words of each sentence, index-aligned with sentTokens
    tokens.wordsPerSentTokens = [];
    // `var` keeps the counter local; it used to leak as an implicit global
    for (var i = 0; i < tokens.sentTokens.length; i++) {
        tokens.wordsPerSentTokens[i] = wordTokenizer.tokenize(tokens.sentTokens[i]);
    }
    masterObject.tokens = tokens;
    // this routes to createDocObject
    callback(null, masterObject, createSentObject);
};
/**
 * Builds the document-level record from the upload record and the cleaned
 * text, and stores it on `masterObject.docObj`.
 *
 * @param {?Error} err - Upstream error; when set, no record is built and the
 *     error is forwarded.
 * @param {Object} data - Master object; `uploadObj` and `cleanString` are read.
 * @param {Function} callback - Invoked as
 *     `callback(err, masterObject, addAdjacentSents)`.
 */
var createDocObject = function(err, data, callback) {
    if (err) {
        // an earlier stage failed, so uploadObj may be missing; just pass it on
        return callback(err, data, addAdjacentSents);
    }
    console.log("create doc object");
    var masterObject = data;
    var uploadObj = masterObject.uploadObj;
    masterObject.docObj = {
        // inherited properties
        timestamp: uploadObj.timestamp,
        path: uploadObj.path,
        name: uploadObj.name,
        // unique properties
        dbPath: "documents",
        _id: "d" + uploadObj._id, // add prefix
        docString: masterObject.cleanString,
        docSentIDArray: [], // filled while the sentence objects are created
        docWordIDArray: [], // filled once the word objects exist
        // genealogy
        parentID: [], // complete after
        childID: [] // complete after
    };
    // this routes to createSentObject
    callback(null, masterObject, addAdjacentSents);
};
// Leading characters that mark a sentence as "special", paired with the
// boolean flag recorded on the sentence object for each of them.
var sentMarkerFlags = [
    { flag: "pound", lead: "#" },
    { flag: "quest", lead: "?" },
    { flag: "apos", lead: "`" },
    { flag: "quo", lead: "'" },
    { flag: "arrow", lead: ">" },
    { flag: "star", lead: "*" },
    { flag: "squig", lead: "~" },
    { flag: "vert", lead: "|" },
    { flag: "percent", lead: "%" },
    { flag: "colon", lead: ":" },
    { flag: "slash", lead: "\\" },
    { flag: "square", lead: "[" },
    { flag: "plus", lead: "+" },
    { flag: "under", lead: "_" },
    { flag: "dash", lead: "-" }
];

// Marker-stripping patterns, applied one after another in this order. Within
// a pattern the longest alternative comes first so doubled markers are
// removed whole.
var sentCleanPatterns = [
    /^# /,
    /^(\?\?|\? |\?)/,
    /^(``|` |`)/,
    /^(''|' |')/,
    /^(> |>)/,
    /^\*\*/,
    /^(\* |\*)/,
    /^~~/,
    /^(~ |~)/,
    /^\|\| /,
    /^(\| |\|)/,
    /^(%%|% |%)/,
    /^(::|: |:)/,
    /^(\\\\|\\ |\\)/,
    /^(\[\[|\[ |\[)/, // "[[" must be tried before "[" or it can never match
    /^(\+\+|\+ |\+)/,
    // NOTE(review): unlike every other pattern this one is not anchored, so
    // it removes all underscores in the sentence - confirm that is intended.
    /_/g,
    /^(- |-)/
];

// Returns the sentence text with its leading markers removed.
var stripSentMarkers = function(sentString) {
    var cleaned = sentString;
    for (var p = 0; p < sentCleanPatterns.length; p++) {
        cleaned = cleaned.replace(sentCleanPatterns[p], "");
    }
    return cleaned;
};

/**
 * Builds one record per sentence token, registers each sentence id on the
 * document record, and stores the records on `masterObject.sentObjs`.
 *
 * @param {?Error} err - Upstream error; when set, nothing is built and the
 *     error is forwarded.
 * @param {Object} data - Master object; `uploadObj`, `tokens.sentTokens` and
 *     `docObj` are read, `docObj.docSentIDArray` is filled.
 * @param {Function} callback - Invoked as
 *     `callback(err, masterObject, createWordObjs)`.
 */
var createSentObject = function(err, data, callback) {
    if (err) {
        return callback(err, data, createWordObjs);
    }
    console.log("create sentence objects");
    var masterObject = data;
    var uploadObj = masterObject.uploadObj;
    var docObj = masterObject.docObj;
    var sentences = masterObject.tokens.sentTokens;
    // one tokenizer serves every sentence
    var wordTokenizer = new natural.WordTokenizer();
    var sentObjs = [];
    // process each sentence
    for (var i = 0; i < sentences.length; i++) {
        var thisSentence = {};
        // inherited
        thisSentence.timestamp = uploadObj.timestamp;
        thisSentence.path = uploadObj.path;
        thisSentence.name = uploadObj.name;
        thisSentence.docID = docObj._id;
        // unique
        thisSentence.dbPath = "sentences";
        thisSentence._id = "s" + uploadObj._id + i;
        // add sentence ID to document object
        docObj.docSentIDArray[i] = thisSentence._id;
        // content
        thisSentence.sentString = sentences[i];
        thisSentence.sentWordIDArray = []; // add after
        // special characters: one flag per marker, plus specChar when any matched
        var leadChar = thisSentence.sentString.charAt(0);
        thisSentence.specChar = false;
        for (var m = 0; m < sentMarkerFlags.length; m++) {
            var startsWithMarker = leadChar === sentMarkerFlags[m].lead;
            thisSentence[sentMarkerFlags[m].flag] = startsWithMarker;
            if (startsWithMarker) {
                thisSentence.specChar = true;
            }
        }
        // add clean string
        thisSentence.cleanString = stripSentMarkers(thisSentence.sentString);
        // tokenize clean string
        thisSentence.sentWordArray = wordTokenizer.tokenize(thisSentence.cleanString);
        // context
        thisSentence.leftID = null;
        thisSentence.rightID = null;
        // genealogy
        thisSentence.parentID = []; // complete after
        thisSentence.childID = []; // complete after
        // sequence and position; sentTotal is the index of the last sentence
        thisSentence.sentSequence = i;
        thisSentence.sentTotal = sentences.length - 1;
        thisSentence.first = (i === 0);
        thisSentence.last = (i === sentences.length - 1);
        // add sentence to array
        sentObjs[i] = thisSentence;
    }
    // assign sentence objects to master data object
    masterObject.sentObjs = sentObjs;
    // this routes to addAdjacentSents
    callback(null, masterObject, createWordObjs);
};
/**
 * Links every sentence record to its neighbours by id: `leftID` is the
 * previous sentence, `rightID` the next one, `null` at either end.
 *
 * @param {?Error} err - Upstream error; when set, nothing is linked and the
 *     error is forwarded.
 * @param {Object} data - Master object; `sentObjs` is read and updated in place.
 * @param {Function} callback - Invoked as
 *     `callback(err, masterObject, addAdjacentWords)`.
 */
var addAdjacentSents = function(err, data, callback) {
    if (err) {
        return callback(err, data, addAdjacentWords);
    }
    console.log("add adjacent sentence IDs");
    var masterObject = data;
    var sentObjs = masterObject.sentObjs;
    var lastIndex = sentObjs.length - 1;
    // A single loop covers empty, one-element and longer lists alike.
    for (var i = 0; i <= lastIndex; i++) {
        var thisSent = sentObjs[i];
        // the last sentence points left at its predecessor, never at itself
        thisSent.leftID = i > 0 ? sentObjs[i - 1]._id : null;
        thisSent.rightID = i < lastIndex ? sentObjs[i + 1]._id : null;
    }
    // routes to createWordObjs
    callback(null, masterObject, addAdjacentWords);
};
/**
 * Builds one record per word of every sentence, registers each word id on
 * its sentence, and stores the flat list on `masterObject.wordObjs`.
 *
 * @param {?Error} err - Upstream error; when set, nothing is built and the
 *     error is forwarded.
 * @param {Object} data - Master object; `tokens.wordsTokens`, `uploadObj`,
 *     `docObj` and `sentObjs` are read, each `sentWordIDArray` is filled.
 * @param {Function} callback - Invoked as
 *     `callback(err, masterObject, addWordIDtoDoc)`.
 */
var createWordObjs = function(err, data, callback) {
    if (err) {
        return callback(err, data, addWordIDtoDoc);
    }
    console.log("create word objs for each sentence");
    var masterObject = data;
    var tokens = masterObject.tokens;
    var uploadObj = masterObject.uploadObj;
    var docObj = masterObject.docObj;
    var sentObjs = masterObject.sentObjs;
    var allWords = [];
    // Number of words in all sentences handled so far. Kept as a running
    // total so earlier sentences are not re-summed for every sentence.
    var wordsBefore = 0;
    for (var i = 0; i < sentObjs.length; i++) {
        var thisSent = sentObjs[i];
        var wordArray = thisSent.sentWordArray;
        for (var k = 0; k < wordArray.length; k++) {
            var thisWord = {};
            // inherited
            thisWord.timestamp = uploadObj.timestamp;
            thisWord.path = uploadObj.path;
            thisWord.name = uploadObj.name;
            thisWord.docID = docObj._id;
            thisWord.sentID = thisSent._id;
            thisWord.sentSequence = thisSent.sentSequence;
            thisWord.sentTotal = thisSent.sentTotal;
            // unique
            thisWord.dbPath = "words";
            // NOTE(review): the word index is appended to the sentence id
            // without a separator. If sentence ids end in their own numeric
            // index, sentence 1 / word 11 and sentence 11 / word 1 yield the
            // same _id. Left unchanged because it affects stored ids.
            thisWord._id = "w" + thisSent._id.substring(1) + k;
            // assign back to sentence
            thisSent.sentWordIDArray[k] = thisWord._id;
            // content
            thisWord.string = wordArray[k];
            // context
            thisWord.leftID = null;
            thisWord.rightID = null;
            // genealogy
            thisWord.parentID = []; // complete after
            thisWord.childID = []; // complete after
            // sequence and position in sentence; wordSentTotal is the last index
            thisWord.wordSentSequence = k;
            thisWord.wordSentTotal = wordArray.length - 1;
            thisWord.first = (k === 0);
            thisWord.last = (k === wordArray.length - 1);
            // sequence in document
            thisWord.wordDocSequence = k + wordsBefore;
            // NOTE(review): this total comes from the whole-document tokens,
            // while the sequence above counts per-sentence words; the two
            // can disagree if the tokenizations differ - confirm.
            thisWord.wordDocTotal = tokens.wordsTokens.length - 1;
            allWords.push(thisWord);
        }
        wordsBefore += wordArray.length;
    }
    masterObject.wordObjs = allWords;
    // routes to addAdjacentWords
    callback(null, masterObject, addWordIDtoDoc);
};
/**
 * Links every word record to its neighbours inside the same sentence:
 * `leftID` is the previous word, `rightID` the next one, `null` at the
 * sentence edges.
 *
 * @param {?Error} err - Upstream error; when set, nothing is linked and the
 *     error is forwarded.
 * @param {Object} data - Master object; `wordObjs` is read and updated in
 *     place. Each word's `first`, `last` and `wordSentTotal` are relied on
 *     to find the sentence edges.
 * @param {Function} callback - Invoked as
 *     `callback(err, masterObject, addObjectToDatabase)`.
 */
var addAdjacentWords = function(err, data, callback) {
    if (err) {
        return callback(err, data, addObjectToDatabase);
    }
    console.log("add adjacent words");
    var masterObject = data;
    var wordObjs = masterObject.wordObjs;
    for (var i = 0; i < wordObjs.length; i++) {
        // declared locally; it used to leak as an implicit global
        var thisWord = wordObjs[i];
        // make sure this isn't the only word in the sentence
        if (thisWord.wordSentTotal > 0) {
            if (thisWord.first) {
                thisWord.leftID = null;
                thisWord.rightID = wordObjs[i + 1]._id;
            } else if (thisWord.last) {
                thisWord.rightID = null;
                thisWord.leftID = wordObjs[i - 1]._id;
            } else {
                thisWord.rightID = wordObjs[i + 1]._id;
                thisWord.leftID = wordObjs[i - 1]._id;
            }
        } else {
            thisWord.leftID = null;
            thisWord.rightID = null;
        }
    }
    // routes to addWordIDtoDoc
    callback(null, masterObject, addObjectToDatabase);
};
/**
 * Copies the id of every word record, in order, into
 * `docObj.docWordIDArray`.
 *
 * @param {?Error} err - Upstream error; when set, nothing is copied and the
 *     error is forwarded.
 * @param {Object} data - Master object; `docObj` and `wordObjs` are read.
 * @param {Function} callback - Invoked as
 *     `callback(err, masterObject, confirmSave)`.
 */
var addWordIDtoDoc = function(err, data, callback) {
    if (err) {
        // an earlier stage failed, so docObj/wordObjs may be missing
        return callback(err, data, confirmSave);
    }
    console.log("add word IDs to doc object");
    var masterObject = data;
    var docObj = masterObject.docObj;
    var wordObjs = masterObject.wordObjs;
    // assign word id back to document obj array
    for (var i = 0; i < wordObjs.length; i++) {
        docObj.docWordIDArray[i] = wordObjs[i]._id;
    }
    // routes to addObjectToDatabase
    callback(null, masterObject, confirmSave);
};
/**
 * Issues a save for the upload record, the document record and every
 * sentence and word record, then calls back.
 *
 * The saves are fire-and-forget: `callback` is invoked right after they
 * have been issued, and individual save failures are only logged.
 * NOTE(review): callers may depend on that timing, so it is kept; it also
 * means the callback cannot report save errors.
 *
 * @param {?Error} err - Upstream error; when set, nothing is saved (the
 *     records may be incomplete) and the error is forwarded.
 * @param {Object} data - Master object; `uploadObj`, `docObj`, `sentObjs`
 *     and `wordObjs` are read.
 * @param {Function} callback - Invoked as `callback(err, masterObject)`.
 */
var addObjectToDatabase = function(err, data, callback) {
    if (err) {
        return callback(err, data);
    }
    console.log("add objects to database");
    var masterObject = data;
    var uploadObj = masterObject.uploadObj;
    var docObj = masterObject.docObj;
    var sentObjs = masterObject.sentObjs;
    var wordObjs = masterObject.wordObjs;
    // add upload object
    uploadCollection.save(uploadObj, function(saveErr, saved) {
        if (saveErr || !saved) {
            console.log("Not saved");
        } else {
            console.log("Saved " + uploadObj._id);
        }
    });
    // add document
    docCollection.save(docObj, function(saveErr, saved) {
        if (saveErr || !saved) {
            console.log("Not saved");
        } else {
            console.log("Saved " + docObj._id);
        }
    });
    // add sentences
    for (var i = 0; i < sentObjs.length; i++) {
        sentCollection.save(sentObjs[i], function(saveErr, saved) {
            if (saveErr || !saved) {
                console.log("Sentence not saved");
            }
        });
    }
    // add words
    for (var k = 0; k < wordObjs.length; k++) {
        wordCollection.save(wordObjs[k], function(saveErr, saved) {
            if (saveErr || !saved) {
                console.log("Word not saved");
            }
        });
    }
    // routes to confirmSave
    callback(null, masterObject);
};
/**
 * Final pipeline stage: reports the outcome and publishes the processed
 * data through `processedContent`.
 *
 * @param {?Error} err - Error from an earlier stage, if any.
 * @param {Object} data - Master object produced by the pipeline.
 */
var confirmSave = function(err, data) {
    if (err) {
        // do not claim success when an earlier stage reported a failure
        console.error("processing failed: " + (err.message || err));
    } else {
        console.log("everything finished saving to database");
    }
    // send back processed information as json response to user
    // NOTE(review): processedContent is not declared in this block;
    // presumably a module-level variable read elsewhere - confirm.
    processedContent = data;
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment