Created
April 26, 2018 20:18
-
-
Save Hugo-ter-Doest/2f14fe14c4d7319e9de396d77dff3cf6 to your computer and use it in GitHub Desktop.
NER based on regular expressions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Entity patterns keyed by category name. Every pattern carries the `g` flag
// so that repeated exec() calls step through all occurrences in a text.
var regExs = {
  // Matches e-mail addresses. The top-level domain is two OR MORE word
  // characters so that long TLDs such as ".info" are captured in full
  // (a 2-3 character limit truncated "x@y.info" to "x@y.inf").
  "e-mail": /(\w[-._\w]*\w@\w[-._\w]*\w\.\w{2,})/ig,
  // Matches time of the form 19:20
  "time": /[0-9]{1,2}:[0-9][0-9]/g,
  // This regular expression matches dates of the form XX/XX/YYYY
  // where XX can be 1 or 2 digits long and YYYY is always 4 digits long.
  "date": /\d{1,2}\/\d{1,2}\/\d{4}/g,
  // Matches one to four digits followed by two capital letters, e.g. 7559AH
  "zipcode": /[0-9]{1,4}[A-Z]{2}/g,
  // Matches http://210.50.2.215/sd_new/WebBuilder.cgi?RegID=7449046&First=Ok&Upt=Ok&EditPage=3&S
  // A URI is recognised by a scheme (http, https, ftp, ftps), the first two
  // dotted octets of an IP address, a "www." prefix, or a known domain suffix.
  // The octets are written as \d{1,3}\. groups; a character class such as
  // [\d\d\d|\d\d] would only match a single digit or a literal "|".
  "uri": /\b([\d\w\.\/\+\-\?\:]*)((ht|f)tp(s|)\:\/\/|\d{1,3}\.\d{1,3}\.|www\.|\.tv|\.ac|\.com|\.edu|\.gov|\.int|\.mil|\.net|\.org|\.biz|\.info|\.name|\.pro|\.museum|\.co)([\d\w\.\/\%\+\-\=\&\?\:\\\"\'\,\|\~\;]*)\b/g
};
// Sample input text that the patterns are run against.
var str = "this is a hz@hotmai.com string as you hwl@terdoest.info can read 19:20 9:10 31/2/2018 7559AH https://kennisbank.dimpact.nl/jira";
// Collected matches; each entry records the matched string, its category and
// its start/end offsets in the input text.
var edges = [];
/**
 * Finds every occurrence of `regex` in the input text and appends one edge
 * record per occurrence: { matchedString, category, start, end }, where
 * `start` is the match offset and `end` is the offset just past the match.
 *
 * @param {string} cat - Category name stored on each edge.
 * @param {RegExp} regex - Pattern to apply. With the `g` flag all occurrences
 *   are reported; without it only the first one is.
 * @param {Array} [matches] - Array that receives the edges. Falls back to the
 *   file-level `edges` array when omitted.
 * @param {string} [text] - Text to scan. Falls back to the file-level `str`
 *   when omitted.
 */
function matchRegEx(cat, regex, matches, text) {
  var input = text === undefined ? str : text;
  var target = matches || edges;
  // Declared locally: an undeclared `match` is an implicit global and throws
  // a ReferenceError in strict mode / ES modules.
  var match;
  // A global regex keeps its position between calls; always start at the top.
  regex.lastIndex = 0;
  while ((match = regex.exec(input)) !== null) {
    // Computed from the match itself so it is also correct for non-global
    // patterns, whose lastIndex never advances.
    var end = match.index + match[0].length;
    console.log(match);
    console.log(match.index + ' ' + end);
    target.push({"matchedString": match[0],
      "category": cat,
      "start": match.index,
      "end": end
    });
    // Without the g (or y) flag exec() returns the same match forever.
    if (!regex.global && !regex.sticky) {
      break;
    }
    // An empty match does not advance lastIndex; step over it manually.
    if (match[0] === '') {
      regex.lastIndex++;
    }
  }
}
// Apply each category's pattern to the sample text, collecting the results
// in `edges`, then print everything that was found.
Object.keys(regExs).forEach(function(cat) {
  matchRegEx(cat, regExs[cat], edges);
});
console.log(edges);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment