Skip to content

Instantly share code, notes, and snippets.

View Hugo-ter-Doest's full-sized avatar
💭
Working on named entity recognition for NaturalNode

Hugo ter Doest Hugo-ter-Doest

💭
Working on named entity recognition for NaturalNode
View GitHub Profile
// Break the string up into sentences based on punctuation and quotation marks
var tokens = text.match(/(?<=\s+|^)[\"\'\‘\“\'\"\[\(\{\⟨](.*?[.?!])(\s[.?!])*[\"\'\’\”\'\"\]\)\}\⟩](?=\s+|$)|(?<=\s+|^)\S(.*?[.?!])(\s[.?!])*(?=\s+|$)/g);
/* Breakdown of the regular expression
/
(?<=\s+|^) // Lookbehind is whitespace or start-of-input FOLLOWED BY:
[\"\'\‘\“\'\"\[\(\{\⟨]? // Any brackets or quotation characters (optional)
(.*?[.?!]) // any sequence of characters followed by punctuation
(\s[.?!])* // any combination of whitespace and punctuation
[\"\'\’\”\'\"\]\)\}\⟩]? // Any brackets or quotation characters (optional)
@Hugo-ter-Doest
Hugo-ter-Doest / carry_spec.js
Created December 13, 2020 09:55
carry_spec.js
const NodeCarry = require('../src/index.js').NodeCarry;
const nc = new NodeCarry();
const tests = [
{
input: "action",
output: "ac"
},
{
input: "acteur",
[ 'number',
'wquctgtkut',
'number',
'wquctgtkut',
'number',
'wquctgtkut',
'o',
'join' ]
[ 'mail', 'an', 'empty', 'message', 'to', 'o', 'email', 'imkxepvqcp' ]
[ 'v',
// Set-up: load a text file and build two tokenizers from the local `natural` build.
var fs = require('fs');
var natural = require('./lib/natural');
// Read the whole input synchronously as a UTF-8 string.
var data = fs.readFileSync('./data.txt', 'utf8');
var sentenceTok = new natural.SentenceTokenizer();
// Constructed here but not used in the lines visible in this excerpt.
var aggressiveTok = new natural.AggressiveTokenizer();
// Run the sentence tokenizer over the complete file contents.
var sentences = sentenceTok.tokenize(data);
//console.log(sentences);
const _ = require("underscore")._;
const SkipNGrams = require('../lib/natural/ngrams/skip_ngrams');
const sentence = 'insurgents killed in ongoing fighting';
const two_skip_tri_grams = [
['insurgents', 'killed', 'in'],
['insurgents', 'killed', 'ongoing'],
@Hugo-ter-Doest
Hugo-ter-Doest / ConvertSnowball.js
Created January 14, 2019 09:22
Convert Snowball file
// Set-up for converting the Snowball French test data from plain text to JSON.
var fs = require('fs');
var inputFile = './spec/test_data/snowball_fr.txt';
// Destination path; the write itself is not part of the lines shown here.
var outputFile = './spec/test_data/snowball_fr.json';
// Read the whole input synchronously as a UTF-8 string.
var data = fs.readFileSync(inputFile, 'utf8');
// Split on runs of \n and/or \r, so LF, CRLF and blank lines collapse into one
// separator. A trailing newline still leaves one empty string as the last element.
var lines = data.split(/[\n\r]+/);
// Starts empty; presumably filled with word -> stem entries by the loop that follows — verify.
var dict = {};
lines.forEach(line => {
[word, stem] = line.split(/\s+/);
@Hugo-ter-Doest
Hugo-ter-Doest / ConvertBrownCorpus.js
Last active January 13, 2019 20:49
Converts a flat Brown corpus to an object consisting of tagged sentences
// Reads a flat Brown corpus excerpt, wraps it in a Corpus object and writes
// that object out as pretty-printed JSON.
var fs = require('fs');
var inputFile = './spec/test_data/browntag_nolines_excerpt.txt';
var outputFile = './spec/test_data/browntag_nolines_excerpt.json';
var Corpus = require('../../lib/natural/brill_pos_tagger/lib/Corpus');
var SentenceClass = require('../../lib/natural/brill_pos_tagger/lib/Sentence');
// Read the whole corpus synchronously as a UTF-8 string.
var data = fs.readFileSync(inputFile, 'utf8');
// NOTE(review): the second argument (1) is handed to Corpus as-is; its meaning
// is defined by the Corpus constructor — confirm there.
var corpus = new Corpus(data, 1, SentenceClass);
// Serialize with 2-space indentation so the JSON file is human-readable.
fs.writeFileSync(outputFile, JSON.stringify(corpus, null, 2));
@Hugo-ter-Doest
Hugo-ter-Doest / convertFileIndondesian.js
Created August 14, 2018 13:23
Convert kata-dasar.txt to JSON
// Load the kata-dasar word list and report how many entries it splits into.
var fs = require('fs');
var oldFile = './data/kata-dasar.txt';
// Read the whole file synchronously as a UTF-8 string.
var data = fs.readFileSync(oldFile, 'utf-8');
// Split on LF only: CRLF input keeps a trailing '\r' on each entry, and a
// final newline produces one empty string as the last element.
var words = data.split('\n');
// Prints the element count, which includes any such empty trailing element.
console.log(words.length);
@Hugo-ter-Doest
Hugo-ter-Doest / processFTB.js
Last active June 4, 2018 22:47
Process French tree bank to create sentences that are POS tagged
// Dependencies and configuration for processing the French treebank XML files.
var fs = require("fs");
// Declared explicitly: a bare `xml2js = ...` assignment creates an implicit
// global and throws a ReferenceError under strict mode or in an ES module.
var xml2js = require('xml2js');
var Sentence = require('./Sentence');
// Folder with corpus
var corpusPath = "/home/hugo/Workspace/FrenchTreeBank/originXML/current/corpus-constit/";
// Documentation of the treebank:
@Hugo-ter-Doest
Hugo-ter-Doest / NER_RegExp.js
Created April 26, 2018 20:18
NER based on regular expressions
var regExs = {"e-mail": /(\w[-._\w]*\w@\w[-._\w]*\w\.\w{2,3})/ig,
// Matches time of the form 19:20
"time": /[0-9]{1,2}:[0-9][0-9]/g,
// This regular expressions matches dates of the form XX/XX/YYYY
// where XX can be 1 or 2 digits long and YYYY is always 4 digits long.
"date": /\d{1,2}\/\d{1,2}\/\d{4}/g,
"zipcode": /[0-9]{1,4}[A-Z]{2}/g,
// Matches http://210.50.2.215/sd_new/WebBuilder.cgi?RegID=7449046&amp;First=Ok&amp;Upt=Ok&amp;EditPage=3&amp;S
"uri": /\b([\d\w\.\/\+\-\?\:]*)((ht|f)tp(s|)\:\/\/|[\d\d\d|\d\d]\.[\d\d\d|\d\d]\.|www\.|\.tv|\.ac|\.com|\.edu|\.gov|\.int|\.mil|\.net|\.org|\.biz|\.info|\.name|\.pro|\.museum|\.co)([\d\w\.\/\%\+\-\=\&amp;\?\:\\\&quot;\'\,\|\~\;]*)\b/g