Skip to content

Instantly share code, notes, and snippets.

@mrozbarry
Created October 15, 2018 13:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mrozbarry/24766912dbe2076f2cc5a07c4b3e2a5a to your computer and use it in GitHub Desktop.
Save mrozbarry/24766912dbe2076f2cc5a07c4b3e2a5a to your computer and use it in GitHub Desktop.
String tokenizer
/**
 * Splits text into typed tokens using an ordered list of regular-expression
 * matchers.
 *
 * Matchers are applied in priority order: the first matcher claims its
 * matches, and only the text left between (and after) those matches is
 * offered to the next matcher. Text that no matcher claims is emitted as a
 * token of the fallback type.
 */
class Tokenizer {
  /**
   * Creates a tokenizer with no matchers whose fallback token type is 'text'.
   *
   * @returns {Tokenizer}
   */
  static empty() {
    return new Tokenizer('text', []);
  }

  /**
   * Creates a tokenizer that recognizes links first and mentions second,
   * using the patterns stored on `Tokenizer.presets` (which must have been
   * assigned before this is called).
   *
   * @returns {Tokenizer}
   */
  static default() {
    return Tokenizer.empty()
      .mapToType(Tokenizer.presets.link, 'link')
      .mapToType(Tokenizer.presets.mention, 'mention');
  }

  /**
   * @param {string} fallbackType - Token type given to text no matcher claims.
   * @param {Array<{type: string, regularExpression: (RegExp|string)}>} matchers -
   *   Matchers in priority order. The array is stored by reference, and
   *   `mapToType` appends to it.
   */
  constructor(fallbackType, matchers) {
    this.fallbackType = fallbackType;
    this.matchers = matchers;
  }

  /**
   * Appends a matcher with the lowest priority so far.
   *
   * @param {RegExp|string} regularExpression - Pattern to search for.
   * @param {string} type - Token type assigned to each match.
   * @returns {Tokenizer} This tokenizer (mutated), to allow chaining.
   */
  mapToType(regularExpression, type) {
    this.matchers.push({ type, regularExpression });
    return this;
  }

  /**
   * Builds the tokenizer that handles text the first matcher did not claim.
   *
   * @returns {Tokenizer} A new tokenizer with the same fallback type and
   *   every matcher except the first.
   */
  getNextTokenizer() {
    return new Tokenizer(this.fallbackType, this.matchers.slice(1));
  }

  /**
   * Finds the first occurrence of the highest-priority matcher in `text`.
   *
   * @param {string} text - Text to search.
   * @returns {?{start: number, end: number, type: string, value: string}}
   *   The match location (end exclusive), type, and matched text; or null
   *   when there are no matchers, the pattern does not match, or the pattern
   *   matches the empty string.
   */
  getMatchForText(text) {
    const matcher = this.matchers[0];
    if (!matcher) return null;

    // Work on a copy so a shared pattern carrying the g or y flag always
    // starts searching from lastIndex 0 and is never left modified.
    const expression = new RegExp(matcher.regularExpression);
    const match = expression.exec(text);

    // An empty match consumes no text; reporting it would make tokenize()
    // retry the same text forever, so it is treated as "no match".
    if (!match || match[0].length === 0) return null;

    const [value] = match;
    return {
      start: match.index,
      end: match.index + value.length,
      type: matcher.type,
      value,
    };
  }

  /**
   * Converts `text` into a flat, ordered list of tokens.
   *
   * @param {string} text - Text to tokenize.
   * @returns {Array<{type: string, value: string}>} Tokens in source order;
   *   an empty array when `text` is empty or otherwise falsy.
   */
  tokenize(text) {
    if (!text) return [];
    if (this.matchers.length === 0) return [{ type: this.fallbackType, value: text }];

    const nextTokenizer = this.getNextTokenizer();
    const tokens = [];
    // Push one at a time: spreading a very large array into push() can
    // exceed the engine's argument-count limit.
    const appendAll = (items) => {
      for (const item of items) tokens.push(item);
    };

    // Walk the matches with a loop rather than recursing once per match, so
    // call-stack depth depends on the number of matchers, not the number of
    // matches in the text.
    let remaining = text;
    while (remaining) {
      const match = this.getMatchForText(remaining);
      if (!match) break;
      appendAll(nextTokenizer.tokenize(remaining.slice(0, match.start)));
      tokens.push({ type: match.type, value: match.value });
      remaining = remaining.slice(match.end);
    }
    appendAll(nextTokenizer.tokenize(remaining));
    return tokens;
  }

  /**
   * Tokenizes `text` around an already-located match. `tokenize` no longer
   * calls this; it is retained so existing callers keep working.
   *
   * @param {{start: number, end: number, type: string, value: string}} match -
   *   A match as returned by `getMatchForText`.
   * @param {string} text - The text the match was found in.
   * @returns {Array<{type: string, value: string}>} Tokens for the text
   *   before the match (lower-priority matchers only), the match itself, and
   *   the text after the match (all matchers).
   */
  tokenizeWithMatch(match, text) {
    return [
      ...this.getNextTokenizer().tokenize(text.slice(0, match.start)),
      { type: match.type, value: match.value },
      ...this.tokenize(text.slice(match.end)),
    ];
  }

  /**
   * Tokenizes `text` using only the lower-priority matchers. `tokenize` no
   * longer calls this; it is retained so existing callers keep working.
   *
   * @param {string} text - Text the first matcher did not match.
   * @returns {Array<{type: string, value: string}>}
   */
  tokenizeWithoutMatch(text) {
    return this.getNextTokenizer().tokenize(text);
  }
}
// Ready-made patterns for use with Tokenizer#mapToType; Tokenizer.default()
// reads both of them.
Tokenizer.presets = {
// An "@" followed by three or more characters that are not whitespace,
// a comma, or a period.
mention: /@[^\s,\.]{3,}/,
// "http://" or "https://" followed by one or more non-whitespace characters.
link: /https?:\/\/\S+/,
};
// CommonJS export; consumers destructure it: const { Tokenizer } = require(...).
module.exports = {
Tokenizer,
};
const test = require('ava');
const { Tokenizer } = require('./Tokenizer.js');
// Sample inputs shared by the tests below.
const fixtures = {
// No mentions and no links.
plain: 'Some text here',
// Two mentions, the second at the very end of the string.
withMentions: 'Some @mention here and @here',
// One http link and one https link, no mentions.
withLinks: 'Look at this http://link.com/skjfgje-gnfg/190 or https://this.one/ or not',
// A leading mention plus a link whose path contains "@not-mention", so the
// result depends on which matcher has priority.
withMentionsAndLinks: '@here is a https://link.com/@not-mention okay',
// A multi-line message with one mention and two links.
realistic: `Hi @nicholeburton,
I followed your advice and checked out https://some-website-here.com/kg3408hrong-34rbg/fdkbdfhb4kjr but it doesn't show any listings.
Did you mean https://some-other-similar-website.com/real-path instead? That looks like the right thing.`,
};
// With no matchers registered, every fixture must come back as exactly one
// fallback 'text' token holding the whole input.
test('it creates a single text token with no matchers', (t) => {
  for (const value of Object.values(fixtures)) {
    const tokens = Tokenizer.empty().tokenize(value);
    t.deepEqual(tokens, [{ type: 'text', value }]);
  }
});
// A mention-only tokenizer splits out "@name" runs and leaves everything
// else, links included, as plain text.
test('it tokenizes mentions', (t) => {
  const mentionsOnly = Tokenizer.empty().mapToType(Tokenizer.presets.mention, 'mention');

  // Each entry pairs an input with the tokens it is expected to produce.
  const cases = [
    [
      fixtures.plain,
      [{ type: 'text', value: fixtures.plain }],
    ],
    [
      fixtures.withMentions,
      [
        { type: 'text', value: 'Some ' },
        { type: 'mention', value: '@mention' },
        { type: 'text', value: ' here and ' },
        { type: 'mention', value: '@here' },
      ],
    ],
    [
      fixtures.withLinks,
      [
        { type: 'text', value: 'Look at this http://link.com/skjfgje-gnfg/190 or https://this.one/ or not' },
      ],
    ],
    [
      fixtures.withMentionsAndLinks,
      [
        { type: 'mention', value: '@here' },
        { type: 'text', value: ' is a https://link.com/' },
        { type: 'mention', value: '@not-mention' },
        { type: 'text', value: ' okay' },
      ],
    ],
  ];

  for (const [input, expected] of cases) {
    t.deepEqual(mentionsOnly.tokenize(input), expected);
  }
});
// The matcher registered first claims its text first, so the same input
// tokenizes differently depending on registration order.
test('it respects matcher priority order', (t) => {
  const { mention, link } = Tokenizer.presets;
  const input = fixtures.withMentionsAndLinks;

  const mentionFirst = Tokenizer.empty();
  mentionFirst.mapToType(mention, 'mention');
  mentionFirst.mapToType(link, 'link');

  const linkFirst = Tokenizer.empty();
  linkFirst.mapToType(link, 'link');
  linkFirst.mapToType(mention, 'mention');

  // Mentions win: "@not-mention" is carved out of the URL.
  const expectedWhenMentionFirst = [
    { type: 'mention', value: '@here' },
    { type: 'text', value: ' is a ' },
    { type: 'link', value: 'https://link.com/' },
    { type: 'mention', value: '@not-mention' },
    { type: 'text', value: ' okay' },
  ];
  t.deepEqual(mentionFirst.tokenize(input), expectedWhenMentionFirst);

  // Links win: the URL keeps its "@not-mention" suffix.
  const expectedWhenLinkFirst = [
    { type: 'mention', value: '@here' },
    { type: 'text', value: ' is a ' },
    { type: 'link', value: 'https://link.com/@not-mention' },
    { type: 'text', value: ' okay' },
  ];
  t.deepEqual(linkFirst.tokenize(input), expectedWhenLinkFirst);
});
// End-to-end check of the default tokenizer against a multi-line message
// containing one mention and two links.
test('it behaves realistically', (t) => {
  // Template-literal continuation lines start at column 0 on purpose: any
  // leading whitespace would become part of the expected value.
  const expectedTokens = [
    { type: 'text', value: 'Hi ' },
    { type: 'mention', value: '@nicholeburton' },
    { type: 'text', value: `,
I followed your advice and checked out ` },
    { type: 'link', value: 'https://some-website-here.com/kg3408hrong-34rbg/fdkbdfhb4kjr' },
    { type: 'text', value: ` but it doesn't show any listings.
Did you mean ` },
    { type: 'link', value: 'https://some-other-similar-website.com/real-path' },
    { type: 'text', value: ` instead? That looks like the right thing.` },
  ];

  const actualTokens = Tokenizer.default().tokenize(fixtures.realistic);
  t.deepEqual(actualTokens, expectedTokens);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment