Hugo ter Doest (Hugo-ter-Doest)
Working on named entity recognition for NaturalNode
// Usage:
//   var transformationRules = new BrillTransformationRules();
//   transformationRules.rules.forEach(function(ruleFunction) {
//     ruleFunction(taggedSentence, i);
//   });
// where taggedSentence is an array of [word, tag] pairs, for example
// [['the', 'DET'], ['red', 'JJ'], ['book', 'NN']], and i is the position to be processed.
function BrillTransformationRules() {
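The listing cuts off before the rule functions themselves; below is a minimal sketch of what one such rule function could look like, matching the `(taggedSentence, i)` signature from the usage comment. The rule itself is invented for illustration, not taken from the gist:

```javascript
// Example transformation rule (invented): retag a word as a verb (VB)
// when the previous word carries the infinitive-marker tag TO.
function ruleTOthenVB(taggedSentence, i) {
  if (i > 0 && taggedSentence[i - 1][1] === 'TO') {
    taggedSentence[i][1] = 'VB';
  }
}

var sentence = [['to', 'TO'], ['book', 'NN'], ['a', 'DET'], ['room', 'NN']];
ruleTOthenVB(sentence, 1);
console.log(sentence[1]); // [ 'book', 'VB' ]
```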
Hugo-ter-Doest / PorterStemmerSpanish.log
Created March 22, 2018 21:39
Log of porter_stemmer_es_spec.js
Error: abandonadas Expected: abandon Got: abandonad
Error: abba Expected: abba Got: abb
Error: abdicación Expected: abdic Got: abd
Error: abismo Expected: abism Got: ab
Error: abluciones Expected: ablucion Got: ablu
Error: abogadas Expected: abog Got: abogad
Error: abordaban Expected: abord Got: abordab
Error: abordadas Expected: abord Got: abordad
Error: abra Expected: abra Got: abr
Error: abre Expected: abre Got: abr
Hugo-ter-Doest / porter_stemmer_es_spec.js
Created March 22, 2018 21:45
Unit test for Porter stemmer for Spanish
/*
Copyright (c) 2014, Luís Rodrigues
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
Hugo-ter-Doest / NER_RegExp.js
Created April 26, 2018 20:18
NER based on regular expressions
var regExs = {
  // Matches e-mail addresses
  "e-mail": /(\w[-._\w]*\w@\w[-._\w]*\w\.\w{2,3})/ig,
  // Matches times of the form 19:20
  "time": /[0-9]{1,2}:[0-9][0-9]/g,
  // Matches dates of the form XX/XX/YYYY,
  // where XX can be 1 or 2 digits long and YYYY is always 4 digits long
  "date": /\d{1,2}\/\d{1,2}\/\d{4}/g,
  // Matches zipcodes of up to 4 digits followed by two capital letters
  "zipcode": /[0-9]{1,4}[A-Z]{2}/g,
  // Matches URIs such as http://210.50.2.215/sd_new/WebBuilder.cgi?RegID=7449046&First=Ok&Upt=Ok&EditPage=3&S
  "uri": /\b([\d\w\.\/\+\-\?\:]*)((ht|f)tp(s|)\:\/\/|[\d\d\d|\d\d]\.[\d\d\d|\d\d]\.|www\.|\.tv|\.ac|\.com|\.edu|\.gov|\.int|\.mil|\.net|\.org|\.biz|\.info|\.name|\.pro|\.museum|\.co)([\d\w\.\/\%\+\-\=\&\?\:\\\"\'\,\|\~\;]*)\b/g
};
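A short usage sketch for a table of named patterns like the one above; the sample text and the `entities` object are invented for illustration, and only two of the patterns are re-declared so the sketch is self-contained:

```javascript
// Two of the patterns from the gist, re-declared locally
var regExs = {
  "time": /[0-9]{1,2}:[0-9][0-9]/g,
  "date": /\d{1,2}\/\d{1,2}\/\d{4}/g
};

var text = "Meeting on 12/04/2018 at 19:20";
var entities = {};
Object.keys(regExs).forEach(function (type) {
  // String.prototype.match with a /g regex returns every match, or null
  entities[type] = text.match(regExs[type]) || [];
});
console.log(entities); // { time: [ '19:20' ], date: [ '12/04/2018' ] }
```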
Hugo-ter-Doest / processFTB.js
Last active June 4, 2018 22:47
Process French tree bank to create sentences that are POS tagged
var fs = require('fs');
var xml2js = require('xml2js');
var Sentence = require('./Sentence');
// Folder with corpus
var corpusPath = "/home/hugo/Workspace/FrenchTreeBank/originXML/current/corpus-constit/";
// Documentation of the treebank:
Hugo-ter-Doest / convertFileIndondesian.js
Created August 14, 2018 13:23
Convert kata-dasar.txt to JSON
var fs = require('fs');
var oldFile = './data/kata-dasar.txt';
var data = fs.readFileSync(oldFile, 'utf-8');
var words = data.split('\n');
console.log(words.length);
// Write the word list out as JSON (the gist excerpt is truncated before
// this step; the output filename below is an assumption)
fs.writeFileSync('./data/kata-dasar.json', JSON.stringify(words, null, 2));
Hugo-ter-Doest / ConvertBrownCorpus.js
Last active January 13, 2019 20:49
Converts a flat Brown corpus to an object consisting of tagged sentences
var fs = require('fs');
var inputFile = './spec/test_data/browntag_nolines_excerpt.txt';
var outputFile = './spec/test_data/browntag_nolines_excerpt.json';
var Corpus = require('../../lib/natural/brill_pos_tagger/lib/Corpus');
var SentenceClass = require('../../lib/natural/brill_pos_tagger/lib/Sentence');
var data = fs.readFileSync(inputFile, 'utf8');
var corpus = new Corpus(data, 1, SentenceClass);
fs.writeFileSync(outputFile, JSON.stringify(corpus, null, 2));
Hugo-ter-Doest / ConvertSnowball.js
Created January 14, 2019 09:22
Convert Snowball file
var fs = require('fs');
var inputFile = './spec/test_data/snowball_fr.txt';
var outputFile = './spec/test_data/snowball_fr.json';
var data = fs.readFileSync(inputFile, 'utf8');
var lines = data.split(/[\n\r]+/);
var dict = {};
lines.forEach(line => {
  // Each line holds a word and its stem, separated by whitespace
  const [word, stem] = line.split(/\s+/);
  dict[word] = stem;
});
fs.writeFileSync(outputFile, JSON.stringify(dict, null, 2));
const _ = require("underscore")._;
const SkipNGrams = require('../lib/natural/ngrams/skip_ngrams');
const sentence = 'insurgents killed in ongoing fighting';
const two_skip_tri_grams = [
['insurgents', 'killed', 'in'],
['insurgents', 'killed', 'ongoing'],
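The excerpt above relies on natural's skip_ngrams module; below is a standalone plain-JS sketch of the same idea. It assumes the usual definition of a k-skip-n-gram: pick n tokens in sentence order while skipping at most k tokens in total.

```javascript
// k-skip-n-grams: all ordered n-token selections whose span over the
// sentence leaves out at most k tokens.
function skipNGrams(tokens, n, k) {
  const results = [];
  // Recursively choose increasing indices into `tokens`
  function choose(start, picked) {
    if (picked.length === n) {
      const span = picked[picked.length - 1] - picked[0] + 1;
      if (span - n <= k) results.push(picked.map(i => tokens[i]));
      return;
    }
    for (let i = start; i < tokens.length; i++) {
      choose(i + 1, picked.concat(i));
    }
  }
  choose(0, []);
  return results;
}

const grams = skipNGrams('insurgents killed in ongoing fighting'.split(' '), 3, 2);
console.log(grams.length); // 10, starting with [ 'insurgents', 'killed', 'in' ]
```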
var fs = require('fs');
var natural = require('./lib/natural');
var data = fs.readFileSync('./data.txt', 'utf8');
var sentenceTok = new natural.SentenceTokenizer();
var aggressiveTok = new natural.AggressiveTokenizer();
var sentences = sentenceTok.tokenize(data);
//console.log(sentences);
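The excerpt stops after sentence tokenization; below is a plain-JS sketch of the full pipeline, with simple regexes standing in for natural's SentenceTokenizer and AggressiveTokenizer (an approximation for illustration, not the library's actual behaviour):

```javascript
var text = "Hello world. How are you?";

// Split into sentences on whitespace that follows ., ? or !
var sentences = text.split(/(?<=[.?!])\s+/);

// Aggressively tokenize each sentence into lower-cased word tokens
var words = sentences.map(function (s) {
  return s.toLowerCase().match(/[a-z0-9]+/g) || [];
});

console.log(words); // [ [ 'hello', 'world' ], [ 'how', 'are', 'you' ] ]
```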