This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Break the string `text` up into sentences based on punctuation and quotation marks.
// Two alternatives are tried at each position that follows whitespace or the start of input:
//   1. an opening quote/bracket, the shortest run of characters ending in `.`, `?` or `!`,
//      any further whitespace-plus-terminator pairs, then a closing quote/bracket;
//   2. a non-whitespace character followed by the same run, with no closing quote/bracket.
// Either alternative must be followed by whitespace or the end of input.
// With the `g` flag, `match` returns an array of every matched sentence, or `null` when
// nothing matches. The lookbehind assertions `(?<=...)` require an ES2018-capable engine.
var tokens = text.match(/(?<=\s+|^)[\"\'\‘\“\'\"\[\(\{\⟨](.*?[.?!])(\s[.?!])*[\"\'\’\”\'\"\]\)\}\⟩](?=\s+|$)|(?<=\s+|^)\S(.*?[.?!])(\s[.?!])*(?=\s+|$)/g); | |
/* Breakdown of the regular expression | |
/ | |
(?<=\s+|^) // Lookbehind is whitespace or start-of-input FOLLOWED BY: | |
[\"\'\‘\“\'\"\[\(\{\⟨]? // Any brackets or quotation characters (optional) | |
(.*?[.?!]) // any sequence of characters followed by punctuation
(\s[.?!])* // any combination of whitespace and punctuation | |
[\"\'\’\”\'\"\]\)\}\⟩]? // Any brackets or quotation characters (optional) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Take the NodeCarry constructor from the package entry point and build the
// instance used below.
const { NodeCarry } = require('../src/index.js');
const nc = new NodeCarry();
const tests = [ | |
{ | |
input: "action", | |
output: "ac" | |
}, | |
{ | |
input: "acteur", |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[ 'number', | |
'wquctgtkut', | |
'number', | |
'wquctgtkut', | |
'number', | |
'wquctgtkut', | |
'o', | |
'join' ] | |
[ 'mail', 'an', 'empty', 'message', 'to', 'o', 'email', 'imkxepvqcp' ] | |
[ 'v', |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Dependencies: Node's file-system module and the local `natural` library.
var fs = require('fs');
var natural = require('./lib/natural');

// Load the whole input text into memory as a UTF-8 string.
var data = fs.readFileSync('./data.txt', 'utf8');

// One tokenizer for sentence splitting and one aggressive tokenizer.
var sentenceTok = new natural.SentenceTokenizer();
var aggressiveTok = new natural.AggressiveTokenizer();

// Split the loaded text into sentences.
var sentences = sentenceTok.tokenize(data);
//console.log(sentences); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Underscore exposes itself as the `_` property of its module export.
const { _ } = require("underscore");
const SkipNGrams = require('../lib/natural/ngrams/skip_ngrams');

// Sentence that the expected n-gram lists below are derived from.
const sentence = 'insurgents killed in ongoing fighting';
const two_skip_tri_grams = [ | |
['insurgents', 'killed', 'in'], | |
['insurgents', 'killed', 'ongoing'], |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var fs = require('fs');

// Source text file and the JSON file name declared alongside it.
var inputFile = './spec/test_data/snowball_fr.txt';
var outputFile = './spec/test_data/snowball_fr.json';

// Accumulator object, starts empty.
var dict = {};

// Read the whole input as UTF-8 and split it on runs of CR/LF characters,
// so that blank lines in the middle of the file do not produce empty entries.
var data = fs.readFileSync(inputFile, 'utf8');
var lines = data.split(/[\n\r]+/);
lines.forEach(line => { | |
[word, stem] = line.split(/\s+/); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Converts the Brown-tagged excerpt into JSON: reads the text file, builds a
// Corpus from it and writes the serialized corpus next to the input.
var fs = require('fs');

var inputFile = './spec/test_data/browntag_nolines_excerpt.txt';
var outputFile = './spec/test_data/browntag_nolines_excerpt.json';

var Corpus = require('../../lib/natural/brill_pos_tagger/lib/Corpus');
var SentenceClass = require('../../lib/natural/brill_pos_tagger/lib/Sentence');

// Read the whole excerpt as a UTF-8 string.
var data = fs.readFileSync(inputFile, 'utf8');

// NOTE(review): the meaning of the second argument (1) is defined by Corpus,
// which is not visible here; confirm against the Corpus constructor.
var corpus = new Corpus(data, 1, SentenceClass);

// Pretty-print with a two-space indent so the generated file is diffable.
fs.writeFileSync(outputFile, JSON.stringify(corpus, null, 2));
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Prints how many newline-separated entries the base-word list contains.
var fs = require('fs');

var oldFile = './data/kata-dasar.txt';

// Read the whole file as UTF-8 and split it on line feeds only.
var data = fs.readFileSync(oldFile, 'utf-8');
var words = data.split('\n');

console.log(words.length);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var fs = require("fs");
// Declared with `var` like its siblings: the bare assignment created an
// implicit global and throws a ReferenceError under strict mode.
var xml2js = require('xml2js');
var Sentence = require('./Sentence');

// Folder with corpus
// NOTE(review): absolute, machine-specific path; adjust it to the local
// checkout of the treebank before running.
var corpusPath = "/home/hugo/Workspace/FrenchTreeBank/originXML/current/corpus-constit/";
// Documentation of the treebank: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var regExs = {"e-mail": /(\w[-._\w]*\w@\w[-._\w]*\w\.\w{2,3})/ig, | |
// Matches time of the form 19:20 | |
"time": /[0-9]{1,2}:[0-9][0-9]/g, | |
// This regular expression matches dates of the form XX/XX/YYYY
// where XX can be 1 or 2 digits long and YYYY is always 4 digits long. | |
"date": /\d{1,2}\/\d{1,2}\/\d{4}/g, | |
"zipcode": /[0-9]{1,4}[A-Z]{2}/g, | |
// Matches http://210.50.2.215/sd_new/WebBuilder.cgi?RegID=7449046&First=Ok&Upt=Ok&EditPage=3&S | |
"uri": /\b([\d\w\.\/\+\-\?\:]*)((ht|f)tp(s|)\:\/\/|[\d\d\d|\d\d]\.[\d\d\d|\d\d]\.|www\.|\.tv|\.ac|\.com|\.edu|\.gov|\.int|\.mil|\.net|\.org|\.biz|\.info|\.name|\.pro|\.museum|\.co)([\d\w\.\/\%\+\-\=\&\?\:\\\"\'\,\|\~\;]*)\b/g |
NewerOlder