This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Usage: | |
// transformationRules = new BrillTransformationRules(); | |
// transformationRules.rules.forEach(function(ruleFunction) { | |
// ruleFunction(taggedSentence, i); | |
// }); | |
// where taggedSentence is an array of arrays of the form: | |
// [[the, DET], [red, JJ], [book, NN]] and i is the position to be processed | |
function BrillTransformationRules() { |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Error: abandonadas Expected: abandon Got: abandonad | |
Error: abba Expected: abba Got: abb | |
Error: abdicación Expected: abdic Got: abd | |
Error: abismo Expected: abism Got: ab | |
Error: abluciones Expected: ablucion Got: ablu | |
Error: abogadas Expected: abog Got: abogad | |
Error: abordaban Expected: abord Got: abordab | |
Error: abordadas Expected: abord Got: abordad | |
Error: abra Expected: abra Got: abr | |
Error: abre Expected: abre Got: abr |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Copyright (c) 2014, Luís Rodrigues | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var regExs = {"e-mail": /(\w[-._\w]*\w@\w[-._\w]*\w\.\w{2,3})/ig, | |
// Matches time of the form 19:20 | |
"time": /[0-9]{1,2}:[0-9][0-9]/g, | |
// This regular expression matches dates of the form XX/XX/YYYY | |
// where XX can be 1 or 2 digits long and YYYY is always 4 digits long. | |
"date": /\d{1,2}\/\d{1,2}\/\d{4}/g, | |
"zipcode": /[0-9]{1,4}[A-Z]{2}/g, | |
// Matches http://210.50.2.215/sd_new/WebBuilder.cgi?RegID=7449046&First=Ok&Upt=Ok&EditPage=3&S | |
"uri": /\b([\d\w\.\/\+\-\?\:]*)((ht|f)tp(s|)\:\/\/|[\d\d\d|\d\d]\.[\d\d\d|\d\d]\.|www\.|\.tv|\.ac|\.com|\.edu|\.gov|\.int|\.mil|\.net|\.org|\.biz|\.info|\.name|\.pro|\.museum|\.co)([\d\w\.\/\%\+\-\=\&\?\:\\\"\'\,\|\~\;]*)\b/g |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Script prelude: loads Node's fs module, the xml2js package and the local
// ./Sentence module, and records where the corpus lives on disk.
var fs = require("fs");
// Declared with var: the bare assignment `xml2js = ...` created an implicit
// global, which leaks outside this module and throws in strict mode.
var xml2js = require('xml2js');
var Sentence = require('./Sentence');
// Folder with corpus
// NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
var corpusPath = "/home/hugo/Workspace/FrenchTreeBank/originXML/current/corpus-constit/";
// Documentation of the treebank: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Reads ./data/kata-dasar.txt synchronously as UTF-8, splits it on newlines
// and prints the number of resulting entries.
var fs = require('fs');
// Relative path: fs resolves it against the process working directory.
var oldFile = './data/kata-dasar.txt';
var data = fs.readFileSync(oldFile, 'utf-8');
// Splitting on '\n' means a trailing newline in the file produces one extra,
// empty entry at the end of the array.
var words = data.split('\n');
console.log(words.length);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Builds a Corpus object from the text in inputFile and writes its JSON
// serialisation to outputFile.
var fs = require('fs');

// These two paths are passed to fs, so they are resolved against the process
// working directory; the require() paths below are resolved relative to this
// script's own location instead.
var inputFile = './spec/test_data/browntag_nolines_excerpt.txt';
var outputFile = './spec/test_data/browntag_nolines_excerpt.json';

var Corpus = require('../../lib/natural/brill_pos_tagger/lib/Corpus');
var SentenceClass = require('../../lib/natural/brill_pos_tagger/lib/Sentence');

var data = fs.readFileSync(inputFile, 'utf8');
// NOTE(review): the meaning of the second argument (1) is defined by the
// Corpus constructor, which is not visible here; confirm against Corpus.
var corpus = new Corpus(data, 1, SentenceClass);

// Pretty-print with a two-space indent so the generated file is readable.
fs.writeFileSync(outputFile, JSON.stringify(corpus, null, 2));
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var fs = require('fs'); | |
var inputFile = './spec/test_data/snowball_fr.txt'; | |
var outputFile = './spec/test_data/snowball_fr.json'; | |
var data = fs.readFileSync(inputFile, 'utf8'); | |
var lines = data.split(/[\n\r]+/); | |
var dict = {}; | |
lines.forEach(line => { | |
[word, stem] = line.split(/\s+/); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const _ = require("underscore")._; | |
const SkipNGrams = require('../lib/natural/ngrams/skip_ngrams'); | |
const sentence = 'insurgents killed in ongoing fighting'; | |
const two_skip_tri_grams = [ | |
['insurgents', 'killed', 'in'], | |
['insurgents', 'killed', 'ongoing'], |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Script prelude: reads ./data.txt as UTF-8 and runs the sentence tokenizer
// from the local ./lib/natural module over its contents.
var fs = require('fs');
var natural = require('./lib/natural');
// Relative path: fs resolves it against the process working directory.
var data = fs.readFileSync('./data.txt', 'utf8');
var sentenceTok = new natural.SentenceTokenizer();
// Not used in the statements visible here; presumably consumed further down.
var aggressiveTok = new natural.AggressiveTokenizer();
var sentences = sentenceTok.tokenize(data);
//console.log(sentences);
OlderNewer