stephkoltun/processUpload.js

## processUpload.js
/**
 * Pipeline entry point: loads the uploaded file from disk and pairs it with
 * its record in `docMetadata`.
 *
 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
 * @param {Object} data - Upload descriptor; `originalname` and `path` are read.
 * @param {Function} callback - Called as `callback(err, masterObject, tokenizeInput)`.
 *     `err` is null when a metadata record matched and an Error otherwise;
 *     `masterObject.uploadObj` is only set when a record matched.
 */
var processUpload = function(err, data, callback) {
    var file = data;
    var masterObject = {};

    var uploadObj = {
        // the first four characters of the file name are used as the upload id
        _id: file.originalname.substring(0, 4),
        servPath: file.path,
        fileName: file.originalname,
        inputString: fs.readFileSync(file.path, "utf8"),
    };

    // look up the metadata record that describes this file
    var matchFile = null;
    for (var i = 0; i < docMetadata.length; i++) {
        if (docMetadata[i].FileName === file.originalname) {
            matchFile = docMetadata[i];
            break;
        }
    }

    if (matchFile === null) {
        // Reported even when docMetadata is empty. Later steps read
        // masterObject.uploadObj, so the failure is surfaced through `err`
        // instead of being passed along as a success.
        var notFound = "did not find match data for " + uploadObj.fileName;
        console.log(notFound);
        callback(new Error(notFound), masterObject, tokenizeInput);
        return;
    }

    console.log("matched file!");
    uploadObj.timestamp = new Date(Date.parse(matchFile.DateCreated));
    // drop the five-character prefix and the markdown extension
    uploadObj.name = matchFile.FileName.substring(5).replace(".md", "");
    // " - " and single spaces become underscores, then the path is split on ":"
    uploadObj.path = matchFile.FilePath.replace(/( - )|( )/g, "_").split(":");

    masterObject.uploadObj = uploadObj;

    // tokenizeInput is handed over as the continuation for the next step
    callback(null, masterObject, tokenizeInput);
};


/**
 * Normalises the raw upload text so that each line ends like a sentence,
 * then stores the result on `masterObject.cleanString`.
 *
 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
 * @param {Object} data - Master object; `uploadObj.inputString` is read.
 * @param {Function} callback - Called as `callback(null, masterObject, createDocObject)`.
 */
var stripString = function(err, data, callback) {
    var masterObject = data;
    var uploadObj = masterObject.uploadObj;
    console.log("cleaning string");

    // semicolon followed by a newline (optionally after one space)
    var str = uploadObj.inputString.replace(/;(?=(\n)|( \n))/g, ".");
    str = str.replace(/;$/g, "."); // semicolon at the very end of the input
    str = str.replace(/:(?=(\n))/g, "."); // colon then new line
    str = str.replace(/’(?=(\n))/g, "’."); // right quote then new line
    str = str.replace(/ (?=(\n))/g, ""); // one space directly before a new line
    str = str.replace(/_(?=(\n)|( \n))/g, "_.\n"); // underscore then new line
    str = str.replace(/\b\n/g, ".\n"); // word boundary then new line
    // Collapse every run of newlines to one. The previous pairwise passes
    // ran shortest-first, so runs of three or four newlines kept a blank line.
    str = str.replace(/\n{2,}/g, "\n");
    str = str.replace(/\t/g, "");
    str = contractions.expand(str);

    masterObject.cleanString = str;

    // createDocObject is handed over as the continuation for the next step
    callback(null, masterObject, createDocObject);
};


/**
 * Splits the cleaned text into word tokens, sentence tokens and
 * per-sentence word tokens, stored on `masterObject.tokens`.
 *
 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
 * @param {Object} data - Master object; `cleanString` is read.
 * @param {Function} callback - Called as `callback(null, masterObject, createSentObject)`.
 */
var tokenizeInput = function(err, data, callback) {
    var masterObject = data;
    console.log("start tokenizing");

    var cleanString = masterObject.cleanString;

    // initialize tokenizers
    var wordTokenizer = new natural.WordTokenizer();
    var sentenceTokenizer = new sentTokenizerOnly();
    var tokens = {};

    tokens.wordsTokens = wordTokenizer.tokenize(cleanString);
    sentenceTokenizer.setEntry(cleanString);
    tokens.sentTokens = sentenceTokenizer.getSentences();
    tokens.wordsPerSentTokens = [];

    // the index is declared locally; it used to leak as an implicit global
    for (var i = 0; i < tokens.sentTokens.length; i++) {
        tokens.wordsPerSentTokens[i] = wordTokenizer.tokenize(tokens.sentTokens[i]);
    }

    masterObject.tokens = tokens;

    // createSentObject is handed over as the continuation for the next step
    callback(null, masterObject, createSentObject);
};

/**
 * Builds the document-level record from the upload object and the cleaned
 * text, and stores it on `masterObject.docObj`.
 *
 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
 * @param {Object} data - Master object; `uploadObj` and `cleanString` are read.
 * @param {Function} callback - Called as `callback(null, masterObject, addAdjacentSents)`.
 */
var createDocObject = function(err, data, callback) {
    console.log("create doc object");
    var masterObject = data;
    var source = masterObject.uploadObj;

    masterObject.docObj = {
        // inherited from the upload
        timestamp: source.timestamp,
        path: source.path,
        name: source.name,

        // unique to the document
        dbPath: "documents",
        _id: "d" + source._id, // "d" prefix marks a document id
        docString: masterObject.cleanString,
        docSentIDArray: [],
        docWordIDArray: [],

        // genealogy, left empty by this step
        parentID: [],
        childID: []
    };

    // addAdjacentSents is handed over as the continuation for the next step
    callback(null, masterObject, addAdjacentSents);
};

/**
 * Builds one record per sentence token: inherited upload fields, flags for
 * the marker character that opens the sentence, a marker-free clean string
 * with its word tokens, and the position of the sentence in the document.
 * Sentence ids are also appended to `docObj.docSentIDArray`.
 *
 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
 * @param {Object} data - Master object; `uploadObj`, `tokens.sentTokens` and
 *     `docObj` are read.
 * @param {Function} callback - Called as `callback(null, masterObject, createWordObjs)`.
 */
var createSentObject = function(err, data, callback) {
    console.log("create sentence objects");
    var masterObject = data;
    var uploadObj = masterObject.uploadObj;
    var tokens = masterObject.tokens;
    var docObj = masterObject.docObj;

    var sentences = tokens.sentTokens;
    var sentObjs = [];

    // [flag name, test for the character opening the sentence]; the order
    // here is the key order of the flags on each sentence record
    var markerFlags = [
        ["pound", /^#/],
        ["quest", /^\?/],
        ["apos", /^`/],
        ["quo", /^'/],
        ["arrow", /^>/],
        ["star", /^\*/],
        ["squig", /^~/],
        ["vert", /^\|/],
        ["percent", /^%/],
        ["colon", /^:/],
        ["slash", /^\\/],
        ["square", /^\[/],
        ["plus", /^\+/],
        ["under", /^_/],
        ["dash", /^-/]
    ];

    // Applied in this order to build the clean string. Doubled markers are
    // listed before single ones so the longer form wins.
    var markerStrippers = [
        /^# /,
        /(^\?\?)|(^\? )|(^\?)/,
        /(^``)|(^` )|(^`)/,
        /(^'')|(^' )|(^')/,
        /(^> )|(^>)/,
        /(^\*\*)/,
        /(^\* )|(^\*)/,
        /(^~~)/,
        /(^~ )|(^~)/,
        /(^\|\| )/,
        /(^\| )|(^\|)/,
        /(^%%)|(^% )|(^%)/,
        /(^::)|(^: )|(^:)/,
        /(^\\\\)|(^\\ )|(^\\)/,
        // "[[" must come first: it used to sit behind "^\[" and never matched
        /(^\[\[)|(^\[ )|(^\[)/,
        /(^\+\+)|(^\+ )|(^\+)/,
        /_/g, // underscores are removed everywhere, not only at the start
        /(^- )|(^-)/
    ];

    // one tokenizer serves every sentence
    var wordTokenizer = new natural.WordTokenizer();

    for (var i = 0; i < sentences.length; i++) {
        var thisSentence = {};

        // inherited
        thisSentence.timestamp = uploadObj.timestamp;
        thisSentence.path = uploadObj.path;
        thisSentence.name = uploadObj.name;
        thisSentence.docID = docObj._id;

        // unique
        thisSentence.dbPath = "sentences";
        thisSentence._id = "s" + uploadObj._id + i;

        // add sentence ID to document object
        docObj.docSentIDArray[i] = thisSentence._id;

        // content
        thisSentence.sentString = sentences[i];
        thisSentence.sentWordIDArray = []; // filled in by a later step

        // true when the sentence opens with any of the marker characters
        thisSentence.specChar = /^[#\?`'>\*~\|%:\\\[\+_-]/.test(thisSentence.sentString);

        // one boolean per individual marker
        for (var f = 0; f < markerFlags.length; f++) {
            thisSentence[markerFlags[f][0]] = markerFlags[f][1].test(thisSentence.sentString);
        }

        // strip the markers to get the clean string
        var cleaned = thisSentence.sentString;
        for (var s = 0; s < markerStrippers.length; s++) {
            cleaned = cleaned.replace(markerStrippers[s], "");
        }
        thisSentence.cleanString = cleaned;

        // tokenize clean string
        thisSentence.sentWordArray = wordTokenizer.tokenize(thisSentence.cleanString);

        // context, filled in by a later step
        thisSentence.leftID = null;
        thisSentence.rightID = null;
        // genealogy, left empty by this step
        thisSentence.parentID = [];
        thisSentence.childID = [];

        // sequence and position; sentTotal is the index of the last sentence
        thisSentence.sentSequence = i;
        thisSentence.sentTotal = sentences.length - 1;
        thisSentence.first = (i == 0);
        thisSentence.last = (i == sentences.length - 1);

        sentObjs[i] = thisSentence;
    }

    // assign sentence objects to master data object
    masterObject.sentObjs = sentObjs;

    // createWordObjs is handed over as the continuation for the next step
    callback(null, masterObject, createWordObjs);
};

/**
 * Links each sentence record to its neighbours through `leftID` and
 * `rightID`; the first sentence has no left neighbour and the last has no
 * right neighbour.
 *
 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
 * @param {Object} data - Master object; `sentObjs` is read and updated in place.
 * @param {Function} callback - Called as `callback(null, masterObject, addAdjacentWords)`.
 */
var addAdjacentSents = function(err, data, callback) {
    console.log("add adjacent sentence IDs");

    var masterObject = data;
    var sentObjs = masterObject.sentObjs;
    var lastIndex = sentObjs.length - 1;

    // An empty list simply skips the loop. The last sentence now points left
    // at its predecessor; it used to point at itself.
    for (var i = 0; i <= lastIndex; i++) {
        var thisSent = sentObjs[i];
        thisSent.leftID = i > 0 ? sentObjs[i - 1]._id : null;
        thisSent.rightID = i < lastIndex ? sentObjs[i + 1]._id : null;
    }

    // addAdjacentWords is handed over as the continuation for the next step
    callback(null, masterObject, addAdjacentWords);
};


/**
 * Builds one record per word of every sentence's `sentWordArray`, writes the
 * word ids back to the sentence's `sentWordIDArray`, and stores the flat
 * list on `masterObject.wordObjs`.
 *
 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
 * @param {Object} data - Master object; `tokens`, `uploadObj`, `docObj` and
 *     `sentObjs` are read.
 * @param {Function} callback - Called as `callback(null, masterObject, addWordIDtoDoc)`.
 */
var createWordObjs = function(err, data, callback) {
    console.log("create word objs for each sentence");

    var masterObject = data;
    var tokens = masterObject.tokens;
    var uploadObj = masterObject.uploadObj;
    var docObj = masterObject.docObj;
    var sentObjs = masterObject.sentObjs;

    var allWords = [];

    // Words contributed by all earlier sentences, kept as a running total
    // instead of being recounted from the start for every sentence.
    var wordsBefore = 0;

    for (var i = 0; i < sentObjs.length; i++) {
        var thisSent = sentObjs[i];
        var wordArray = thisSent.sentWordArray;

        for (var k = 0; k < wordArray.length; k++) {
            var thisWord = {};

            // inherited
            thisWord.timestamp = uploadObj.timestamp;
            thisWord.path = uploadObj.path;
            thisWord.name = uploadObj.name;
            thisWord.docID = docObj._id;
            thisWord.sentID = thisSent._id;
            thisWord.sentSequence = thisSent.sentSequence;
            thisWord.sentTotal = thisSent.sentTotal;

            // unique
            thisWord.dbPath = "words";
            // NOTE(review): the sentence id's tail and the word index are
            // joined without a separator, so different (sentence, word)
            // pairs can yield the same string (e.g. "...1" + 11 and
            // "...11" + 1) - confirm before relying on uniqueness.
            thisWord._id = "w" + thisSent._id.substring(1) + k;

            // assign back to sentence
            thisSent.sentWordIDArray[k] = thisWord._id;

            // content
            thisWord.string = wordArray[k];

            // context, filled in by a later step
            thisWord.leftID = null;
            thisWord.rightID = null;
            // genealogy, left empty by this step
            thisWord.parentID = [];
            thisWord.childID = [];

            // sequence and position in sentence
            thisWord.wordSentSequence = k;
            thisWord.wordSentTotal = wordArray.length - 1;
            thisWord.first = (k == 0);
            thisWord.last = (k == wordArray.length - 1);

            // sequence in document
            thisWord.wordDocSequence = k + wordsBefore;
            thisWord.wordDocTotal = tokens.wordsTokens.length - 1;

            allWords.push(thisWord);
        }

        wordsBefore += wordArray.length;
    }

    masterObject.wordObjs = allWords;

    // addWordIDtoDoc is handed over as the continuation for the next step
    callback(null, masterObject, addWordIDtoDoc);
};

/**
 * Links each word record to its neighbours inside the same sentence through
 * `leftID` and `rightID`, using the `first`, `last` and `wordSentTotal`
 * fields already present on each word.
 *
 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
 * @param {Object} data - Master object; `wordObjs` is read and updated in place.
 * @param {Function} callback - Called as `callback(null, masterObject, addObjectToDatabase)`.
 */
var addAdjacentWords = function(err, data, callback) {
    console.log("add adjacent words");

    var masterObject = data;
    var wordObjs = masterObject.wordObjs;

    for (var i = 0; i < wordObjs.length; i++) {
        // declared locally; it used to leak as an implicit global
        var thisWord = wordObjs[i];

        // make sure this isn't the only word in the sentence
        if (thisWord.wordSentTotal > 0) {
            if (thisWord.first) {
                thisWord.leftID = null;
                thisWord.rightID = wordObjs[i + 1]._id;
            } else if (thisWord.last) {
                thisWord.rightID = null;
                thisWord.leftID = wordObjs[i - 1]._id;
            } else {
                thisWord.rightID = wordObjs[i + 1]._id;
                thisWord.leftID = wordObjs[i - 1]._id;
            }
        } else {
            thisWord.leftID = null;
            thisWord.rightID = null;
        }
    }

    // addObjectToDatabase is handed over as the continuation for the next step
    callback(null, masterObject, addObjectToDatabase);
};

/**
 * Copies the id of every word record into `docObj.docWordIDArray`, at the
 * same index the word has in `masterObject.wordObjs`.
 *
 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
 * @param {Object} data - Master object; `docObj` and `wordObjs` are read.
 * @param {Function} callback - Called as `callback(null, masterObject, confirmSave)`.
 */
var addWordIDtoDoc = function(err, data, callback) {
    console.log("add word IDs to doc object");

    var masterObject = data;
    var docObj = masterObject.docObj;

    // write each word id into the document's id list, slot for slot
    masterObject.wordObjs.forEach(function(word, position) {
        docObj.docWordIDArray[position] = word._id;
    });

    // confirmSave is handed over as the continuation for the next step
    callback(null, masterObject, confirmSave);
};

/**
 * Saves the upload, document, sentence and word records to their
 * collections and calls back once every save has reported.
 *
 * The continuation used to run right after the saves were issued, before
 * any of them had completed.
 *
 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
 * @param {Object} data - Master object; `uploadObj`, `docObj`, `sentObjs` and
 *     `wordObjs` are read.
 * @param {Function} callback - Called once as `callback(err, masterObject)`,
 *     where `err` is null or the first save failure.
 */
var addObjectToDatabase = function(err, data, callback) {
    console.log("add objects to database");

    var masterObject = data;
    var uploadObj = masterObject.uploadObj;
    var docObj = masterObject.docObj;
    var sentObjs = masterObject.sentObjs;
    var wordObjs = masterObject.wordObjs;

    // One slot per save, counted up front so the countdown stays correct
    // even if a collection reports synchronously.
    var pending = 2 + sentObjs.length + wordObjs.length;
    var firstError = null;

    // Builds a save callback that logs the outcome and releases one slot.
    function track(failureMessage, successMessage) {
        return function(saveErr, saved) {
            if (saveErr || !saved) {
                console.log(failureMessage);
                if (!firstError) {
                    firstError = saveErr || new Error(failureMessage);
                }
            } else if (successMessage) {
                console.log(successMessage);
            }

            pending -= 1;
            if (pending === 0) {
                callback(firstError, masterObject);
            }
        };
    }

    // add upload object
    uploadCollection.save(uploadObj, track("Not saved", "Saved " + uploadObj._id));

    // add document
    docCollection.save(docObj, track("Not saved", "Saved " + docObj._id));

    // add sentences
    for (var i = 0; i < sentObjs.length; i++) {
        sentCollection.save(sentObjs[i], track("Sentence not saved"));
    }

    // add words
    for (var k = 0; k < wordObjs.length; k++) {
        wordCollection.save(wordObjs[k], track("Word not saved"));
    }
};

// Last pipeline step: logs a completion message and keeps the processed data.
//
// err  - not used by this function.
// data - the object handed over by the previous step.
var confirmSave = function(err, data) {
    console.log("everything finished saving to database");

    // Store the processed information in `processedContent`. That name is not
    // declared inside this function; presumably it is a variable declared
    // elsewhere that is later sent back to the user as JSON - confirm
    // against the caller.
    processedContent = data;
}
	/**
	 * Pipeline entry point: loads the uploaded file from disk and pairs it with
	 * its record in `docMetadata`.
	 *
	 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
	 * @param {Object} data - Upload descriptor; `originalname` and `path` are read.
	 * @param {Function} callback - Called as `callback(err, masterObject, tokenizeInput)`.
	 *     `err` is null when a metadata record matched and an Error otherwise;
	 *     `masterObject.uploadObj` is only set when a record matched.
	 */
	var processUpload = function(err, data, callback) {
		var file = data;
		var masterObject = {};

		var uploadObj = {
			// the first four characters of the file name are used as the upload id
			_id: file.originalname.substring(0, 4),
			servPath: file.path,
			fileName: file.originalname,
			inputString: fs.readFileSync(file.path, "utf8"),
		};

		// look up the metadata record that describes this file
		var matchFile = null;
		for (var i = 0; i < docMetadata.length; i++) {
			if (docMetadata[i].FileName === file.originalname) {
				matchFile = docMetadata[i];
				break;
			}
		}

		if (matchFile === null) {
			// Reported even when docMetadata is empty. Later steps read
			// masterObject.uploadObj, so the failure is surfaced through `err`
			// instead of being passed along as a success.
			var notFound = "did not find match data for " + uploadObj.fileName;
			console.log(notFound);
			callback(new Error(notFound), masterObject, tokenizeInput);
			return;
		}

		console.log("matched file!");
		uploadObj.timestamp = new Date(Date.parse(matchFile.DateCreated));
		// drop the five-character prefix and the markdown extension
		uploadObj.name = matchFile.FileName.substring(5).replace(".md", "");
		// " - " and single spaces become underscores, then the path is split
		// on ":". The alternation bar is a real "|" here; an escaped "\|"
		// would only match a literal pipe character.
		uploadObj.path = matchFile.FilePath.replace(/( - )|( )/g, "_").split(":");

		masterObject.uploadObj = uploadObj;

		// tokenizeInput is handed over as the continuation for the next step
		callback(null, masterObject, tokenizeInput);
	};


	/**
	 * Normalises the raw upload text so that each line ends like a sentence,
	 * then stores the result on `masterObject.cleanString`.
	 *
	 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
	 * @param {Object} data - Master object; `uploadObj.inputString` is read.
	 * @param {Function} callback - Called as `callback(null, masterObject, createDocObject)`.
	 */
	var stripString = function(err, data, callback) {
		var masterObject = data;
		var uploadObj = masterObject.uploadObj;
		console.log("cleaning string");

		// Semicolon followed by a newline (optionally after one space). The
		// lookahead needs a real "|"; an escaped "\|" only matches a literal pipe.
		var str = uploadObj.inputString.replace(/;(?=(\n)|( \n))/g, ".");
		str = str.replace(/;$/g, "."); // semicolon at the very end of the input
		str = str.replace(/:(?=(\n))/g, "."); // colon then new line
		str = str.replace(/’(?=(\n))/g, "’."); // right quote then new line
		str = str.replace(/ (?=(\n))/g, ""); // one space directly before a new line
		str = str.replace(/_(?=(\n)|( \n))/g, "_.\n"); // underscore then new line
		str = str.replace(/\b\n/g, ".\n"); // word boundary then new line
		// Collapse every run of newlines to one. The previous pairwise passes
		// ran shortest-first, so runs of three or four newlines kept a blank line.
		str = str.replace(/\n{2,}/g, "\n");
		str = str.replace(/\t/g, "");
		str = contractions.expand(str);

		masterObject.cleanString = str;

		// createDocObject is handed over as the continuation for the next step
		callback(null, masterObject, createDocObject);
	};



	/**
	 * Splits the cleaned text into word tokens, sentence tokens and
	 * per-sentence word tokens, stored on `masterObject.tokens`.
	 *
	 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
	 * @param {Object} data - Master object; `cleanString` is read.
	 * @param {Function} callback - Called as `callback(null, masterObject, createSentObject)`.
	 */
	var tokenizeInput = function(err, data, callback) {
		var masterObject = data;
		console.log("start tokenizing");

		var cleanString = masterObject.cleanString;

		// initialize tokenizers
		var wordTokenizer = new natural.WordTokenizer();
		var sentenceTokenizer = new sentTokenizerOnly();
		var tokens = {};

		tokens.wordsTokens = wordTokenizer.tokenize(cleanString);
		sentenceTokenizer.setEntry(cleanString);
		tokens.sentTokens = sentenceTokenizer.getSentences();
		tokens.wordsPerSentTokens = [];

		// the index is declared locally; it used to leak as an implicit global
		for (var i = 0; i < tokens.sentTokens.length; i++) {
			tokens.wordsPerSentTokens[i] = wordTokenizer.tokenize(tokens.sentTokens[i]);
		}

		masterObject.tokens = tokens;

		// createSentObject is handed over as the continuation for the next step
		callback(null, masterObject, createSentObject);
	};

	/**
	 * Builds the document-level record from the upload object and the cleaned
	 * text, and stores it on `masterObject.docObj`.
	 *
	 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
	 * @param {Object} data - Master object; `uploadObj` and `cleanString` are read.
	 * @param {Function} callback - Called as `callback(null, masterObject, addAdjacentSents)`.
	 */
	var createDocObject = function(err, data, callback) {
		console.log("create doc object");
		var masterObject = data;
		var source = masterObject.uploadObj;

		masterObject.docObj = {
			// inherited from the upload
			timestamp: source.timestamp,
			path: source.path,
			name: source.name,

			// unique to the document
			dbPath: "documents",
			_id: "d" + source._id, // "d" prefix marks a document id
			docString: masterObject.cleanString,
			docSentIDArray: [],
			docWordIDArray: [],

			// genealogy, left empty by this step
			parentID: [],
			childID: []
		};

		// addAdjacentSents is handed over as the continuation for the next step
		callback(null, masterObject, addAdjacentSents);
	};

	/**
	 * Builds one record per sentence token: inherited upload fields, flags for
	 * the marker character that opens the sentence, a marker-free clean string
	 * with its word tokens, and the position of the sentence in the document.
	 * Sentence ids are also appended to `docObj.docSentIDArray`.
	 *
	 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
	 * @param {Object} data - Master object; `uploadObj`, `tokens.sentTokens` and
	 *     `docObj` are read.
	 * @param {Function} callback - Called as `callback(null, masterObject, createWordObjs)`.
	 */
	var createSentObject = function(err, data, callback) {
		console.log("create sentence objects");
		var masterObject = data;
		var uploadObj = masterObject.uploadObj;
		var tokens = masterObject.tokens;
		var docObj = masterObject.docObj;

		var sentences = tokens.sentTokens;
		var sentObjs = [];

		// [flag name, test for the character opening the sentence]; the order
		// here is the key order of the flags on each sentence record.
		// "vert" tests for a literal pipe (/^\|/); the form /^\\|/ would
		// match every string through its empty alternative.
		var markerFlags = [
			["pound", /^#/],
			["quest", /^\?/],
			["apos", /^`/],
			["quo", /^'/],
			["arrow", /^>/],
			["star", /^\*/],
			["squig", /^~/],
			["vert", /^\|/],
			["percent", /^%/],
			["colon", /^:/],
			["slash", /^\\/],
			["square", /^\[/],
			["plus", /^\+/],
			["under", /^_/],
			["dash", /^-/]
		];

		// Applied in this order to build the clean string. Doubled markers are
		// listed before single ones so the longer form wins. Every bar between
		// groups is a real alternation, not an escaped literal pipe.
		var markerStrippers = [
			/^# /,
			/(^\?\?)|(^\? )|(^\?)/,
			/(^``)|(^` )|(^`)/,
			/(^'')|(^' )|(^')/,
			/(^> )|(^>)/,
			/(^\*\*)/,
			/(^\* )|(^\*)/,
			/(^~~)/,
			/(^~ )|(^~)/,
			/(^\|\| )/,
			/(^\| )|(^\|)/,
			/(^%%)|(^% )|(^%)/,
			/(^::)|(^: )|(^:)/,
			/(^\\\\)|(^\\ )|(^\\)/,
			// "[[" must come before "[" or it can never match
			/(^\[\[)|(^\[ )|(^\[)/,
			/(^\+\+)|(^\+ )|(^\+)/,
			/_/g, // underscores are removed everywhere, not only at the start
			/(^- )|(^-)/
		];

		// one tokenizer serves every sentence
		var wordTokenizer = new natural.WordTokenizer();

		for (var i = 0; i < sentences.length; i++) {
			var thisSentence = {};

			// inherited
			thisSentence.timestamp = uploadObj.timestamp;
			thisSentence.path = uploadObj.path;
			thisSentence.name = uploadObj.name;
			thisSentence.docID = docObj._id;

			// unique
			thisSentence.dbPath = "sentences";
			thisSentence._id = "s" + uploadObj._id + i;

			// add sentence ID to document object
			docObj.docSentIDArray[i] = thisSentence._id;

			// content
			thisSentence.sentString = sentences[i];
			thisSentence.sentWordIDArray = []; // filled in by a later step

			// true when the sentence opens with any of the marker characters
			thisSentence.specChar = /^[#\?`'>\*~\|%:\\\[\+_-]/.test(thisSentence.sentString);

			// one boolean per individual marker
			for (var f = 0; f < markerFlags.length; f++) {
				thisSentence[markerFlags[f][0]] = markerFlags[f][1].test(thisSentence.sentString);
			}

			// strip the markers to get the clean string
			var cleaned = thisSentence.sentString;
			for (var s = 0; s < markerStrippers.length; s++) {
				cleaned = cleaned.replace(markerStrippers[s], "");
			}
			thisSentence.cleanString = cleaned;

			// tokenize clean string
			thisSentence.sentWordArray = wordTokenizer.tokenize(thisSentence.cleanString);

			// context, filled in by a later step
			thisSentence.leftID = null;
			thisSentence.rightID = null;
			// genealogy, left empty by this step
			thisSentence.parentID = [];
			thisSentence.childID = [];

			// sequence and position; sentTotal is the index of the last sentence
			thisSentence.sentSequence = i;
			thisSentence.sentTotal = sentences.length - 1;
			thisSentence.first = (i == 0);
			thisSentence.last = (i == sentences.length - 1);

			sentObjs[i] = thisSentence;
		}

		// assign sentence objects to master data object
		masterObject.sentObjs = sentObjs;

		// createWordObjs is handed over as the continuation for the next step
		callback(null, masterObject, createWordObjs);
	};

	/**
	 * Links each sentence record to its neighbours through `leftID` and
	 * `rightID`; the first sentence has no left neighbour and the last has no
	 * right neighbour.
	 *
	 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
	 * @param {Object} data - Master object; `sentObjs` is read and updated in place.
	 * @param {Function} callback - Called as `callback(null, masterObject, addAdjacentWords)`.
	 */
	var addAdjacentSents = function(err, data, callback) {
		console.log("add adjacent sentence IDs");

		var masterObject = data;
		var sentObjs = masterObject.sentObjs;
		var lastIndex = sentObjs.length - 1;

		// An empty list simply skips the loop. The last sentence now points left
		// at its predecessor; it used to point at itself.
		for (var i = 0; i <= lastIndex; i++) {
			var thisSent = sentObjs[i];
			thisSent.leftID = i > 0 ? sentObjs[i - 1]._id : null;
			thisSent.rightID = i < lastIndex ? sentObjs[i + 1]._id : null;
		}

		// addAdjacentWords is handed over as the continuation for the next step
		callback(null, masterObject, addAdjacentWords);
	};


	/**
	 * Builds one record per word of every sentence's `sentWordArray`, writes the
	 * word ids back to the sentence's `sentWordIDArray`, and stores the flat
	 * list on `masterObject.wordObjs`.
	 *
	 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
	 * @param {Object} data - Master object; `tokens`, `uploadObj`, `docObj` and
	 *     `sentObjs` are read.
	 * @param {Function} callback - Called as `callback(null, masterObject, addWordIDtoDoc)`.
	 */
	var createWordObjs = function(err, data, callback) {
		console.log("create word objs for each sentence");

		var masterObject = data;
		var tokens = masterObject.tokens;
		var uploadObj = masterObject.uploadObj;
		var docObj = masterObject.docObj;
		var sentObjs = masterObject.sentObjs;

		var allWords = [];

		// Words contributed by all earlier sentences, kept as a running total
		// instead of being recounted from the start for every sentence.
		var wordsBefore = 0;

		for (var i = 0; i < sentObjs.length; i++) {
			var thisSent = sentObjs[i];
			var wordArray = thisSent.sentWordArray;

			for (var k = 0; k < wordArray.length; k++) {
				var thisWord = {};

				// inherited
				thisWord.timestamp = uploadObj.timestamp;
				thisWord.path = uploadObj.path;
				thisWord.name = uploadObj.name;
				thisWord.docID = docObj._id;
				thisWord.sentID = thisSent._id;
				thisWord.sentSequence = thisSent.sentSequence;
				thisWord.sentTotal = thisSent.sentTotal;

				// unique
				thisWord.dbPath = "words";
				// NOTE(review): the sentence id's tail and the word index are
				// joined without a separator, so different (sentence, word)
				// pairs can yield the same string (e.g. "...1" + 11 and
				// "...11" + 1) - confirm before relying on uniqueness.
				thisWord._id = "w" + thisSent._id.substring(1) + k;

				// assign back to sentence
				thisSent.sentWordIDArray[k] = thisWord._id;

				// content
				thisWord.string = wordArray[k];

				// context, filled in by a later step
				thisWord.leftID = null;
				thisWord.rightID = null;
				// genealogy, left empty by this step
				thisWord.parentID = [];
				thisWord.childID = [];

				// sequence and position in sentence
				thisWord.wordSentSequence = k;
				thisWord.wordSentTotal = wordArray.length - 1;
				thisWord.first = (k == 0);
				thisWord.last = (k == wordArray.length - 1);

				// sequence in document
				thisWord.wordDocSequence = k + wordsBefore;
				thisWord.wordDocTotal = tokens.wordsTokens.length - 1;

				allWords.push(thisWord);
			}

			wordsBefore += wordArray.length;
		}

		masterObject.wordObjs = allWords;

		// addWordIDtoDoc is handed over as the continuation for the next step
		callback(null, masterObject, addWordIDtoDoc);
	};

	/**
	 * Links each word record to its neighbours inside the same sentence through
	 * `leftID` and `rightID`, using the `first`, `last` and `wordSentTotal`
	 * fields already present on each word.
	 *
	 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
	 * @param {Object} data - Master object; `wordObjs` is read and updated in place.
	 * @param {Function} callback - Called as `callback(null, masterObject, addObjectToDatabase)`.
	 */
	var addAdjacentWords = function(err, data, callback) {
		console.log("add adjacent words");

		var masterObject = data;
		var wordObjs = masterObject.wordObjs;

		for (var i = 0; i < wordObjs.length; i++) {
			// declared locally; it used to leak as an implicit global
			var thisWord = wordObjs[i];

			// make sure this isn't the only word in the sentence
			if (thisWord.wordSentTotal > 0) {
				if (thisWord.first) {
					thisWord.leftID = null;
					thisWord.rightID = wordObjs[i + 1]._id;
				} else if (thisWord.last) {
					thisWord.rightID = null;
					thisWord.leftID = wordObjs[i - 1]._id;
				} else {
					thisWord.rightID = wordObjs[i + 1]._id;
					thisWord.leftID = wordObjs[i - 1]._id;
				}
			} else {
				thisWord.leftID = null;
				thisWord.rightID = null;
			}
		}

		// addObjectToDatabase is handed over as the continuation for the next step
		callback(null, masterObject, addObjectToDatabase);
	};

	/**
	 * Copies the id of every word record into `docObj.docWordIDArray`, at the
	 * same index the word has in `masterObject.wordObjs`.
	 *
	 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
	 * @param {Object} data - Master object; `docObj` and `wordObjs` are read.
	 * @param {Function} callback - Called as `callback(null, masterObject, confirmSave)`.
	 */
	var addWordIDtoDoc = function(err, data, callback) {
		console.log("add word IDs to doc object");

		var masterObject = data;
		var docObj = masterObject.docObj;

		// write each word id into the document's id list, slot for slot
		masterObject.wordObjs.forEach(function(word, position) {
			docObj.docWordIDArray[position] = word._id;
		});

		// confirmSave is handed over as the continuation for the next step
		callback(null, masterObject, confirmSave);
	};

	/**
	 * Saves the upload, document, sentence and word records to their
	 * collections and calls back once every save has reported.
	 *
	 * The continuation used to run right after the saves were issued, before
	 * any of them had completed, and the failure checks were written with
	 * escaped bars (`\|\|`), which is not valid JavaScript.
	 *
	 * @param {*} err - Ignored; kept so the step matches the pipeline signature.
	 * @param {Object} data - Master object; `uploadObj`, `docObj`, `sentObjs` and
	 *     `wordObjs` are read.
	 * @param {Function} callback - Called once as `callback(err, masterObject)`,
	 *     where `err` is null or the first save failure.
	 */
	var addObjectToDatabase = function(err, data, callback) {
		console.log("add objects to database");

		var masterObject = data;
		var uploadObj = masterObject.uploadObj;
		var docObj = masterObject.docObj;
		var sentObjs = masterObject.sentObjs;
		var wordObjs = masterObject.wordObjs;

		// One slot per save, counted up front so the countdown stays correct
		// even if a collection reports synchronously.
		var pending = 2 + sentObjs.length + wordObjs.length;
		var firstError = null;

		// Builds a save callback that logs the outcome and releases one slot.
		function track(failureMessage, successMessage) {
			return function(saveErr, saved) {
				if (saveErr || !saved) {
					console.log(failureMessage);
					if (!firstError) {
						firstError = saveErr || new Error(failureMessage);
					}
				} else if (successMessage) {
					console.log(successMessage);
				}

				pending -= 1;
				if (pending === 0) {
					callback(firstError, masterObject);
				}
			};
		}

		// add upload object
		uploadCollection.save(uploadObj, track("Not saved", "Saved " + uploadObj._id));

		// add document
		docCollection.save(docObj, track("Not saved", "Saved " + docObj._id));

		// add sentences
		for (var i = 0; i < sentObjs.length; i++) {
			sentCollection.save(sentObjs[i], track("Sentence not saved"));
		}

		// add words
		for (var k = 0; k < wordObjs.length; k++) {
			wordCollection.save(wordObjs[k], track("Word not saved"));
		}
	};

	// Last pipeline step: logs a completion message and keeps the processed data.
	//
	// err  - not used by this function.
	// data - the object handed over by the previous step.
	var confirmSave = function(err, data) {
	console.log("everything finished saving to database");

	// Store the processed information in `processedContent`. That name is not
	// declared inside this function; presumably it is a variable declared
	// elsewhere that is later sent back to the user as JSON - confirm
	// against the caller.
	processedContent = data;
	}