Keras text tokenizer in JavaScript with minimal functionality
By @dlebech · Last active August 11, 2022
// tokenizer.js
// Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
class Tokenizer {
  constructor(config = {}) {
    // Characters stripped from texts before tokenizing. Mirrors the default
    // punctuation filter of the Keras Tokenizer.
    this.filters = config.filters || /[\\.,/#!$%^&*;:{}=\-_`~()]/g;
    // Lowercase all text by default, like Keras.
    this.lower = typeof config.lower === 'undefined' ? true : config.lower;
    // Primary indexing maps: word to index and index to word.
    this.wordIndex = {};
    this.indexWord = {};
    // Word counts, used to assign lower indices to more frequent words.
    this.wordCounts = {};
  }

  // Lowercase (if enabled), strip filtered characters, collapse repeated
  // whitespace, and split into words.
  cleanText(text) {
    if (this.lower) text = text.toLowerCase();
    return text
      .replace(this.filters, '')
      .replace(/\s{2,}/g, ' ')
      .split(' ');
  }

  // Build the word index from an array of texts. Words are indexed by
  // descending frequency, starting at 1.
  fitOnTexts(texts) {
    texts.forEach(text => {
      this.cleanText(text).forEach(word => {
        this.wordCounts[word] = (this.wordCounts[word] || 0) + 1;
      });
    });
    Object.entries(this.wordCounts)
      .sort((a, b) => b[1] - a[1])
      .forEach(([word], i) => {
        this.wordIndex[word] = i + 1;
        this.indexWord[i + 1] = word;
      });
  }

  // Convert an array of texts into arrays of word indices.
  // Out-of-vocabulary words map to 0.
  textsToSequences(texts) {
    return texts.map(text => this.cleanText(text).map(word => this.wordIndex[word] || 0));
  }

  // Serialize the tokenizer state to a JSON string.
  toJson() {
    return JSON.stringify({
      wordIndex: this.wordIndex,
      indexWord: this.indexWord,
      wordCounts: this.wordCounts,
    });
  }
}

// Recreate a tokenizer from a JSON string produced by toJson().
export const tokenizerFromJson = json_string => {
  const tokenizer = new Tokenizer();
  const js = JSON.parse(json_string);
  tokenizer.wordIndex = js.wordIndex;
  tokenizer.indexWord = js.indexWord;
  tokenizer.wordCounts = js.wordCounts;
  return tokenizer;
};

export default Tokenizer;
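
Example usage (a minimal sketch; the corpus and the expected outputs in the comments are illustrative, not part of the gist):

import Tokenizer, { tokenizerFromJson } from './tokenizer';

// Fit on a small corpus, then convert new text to integer sequences.
const tokenizer = new Tokenizer();
tokenizer.fitOnTexts(['hello world', 'hello again']);
console.log(tokenizer.textsToSequences(['hello world, again']));
// -> [[1, 2, 3]] (indices are assigned by descending word frequency, from 1)

// Serialize the fitted tokenizer, e.g. to ship alongside a TensorFlow.js
// model, and restore it later.
const restored = tokenizerFromJson(tokenizer.toJson());
console.log(restored.textsToSequences(['hello unknown'])); // -> [[1, 0]]

Note that word indices start at 1, as in Keras, and this minimal version maps any out-of-vocabulary word to 0; Keras itself instead drops unknown words unless an oov_token is configured.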
// Tests for tokenizer.js
// Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
import Tokenizer, { tokenizerFromJson } from './tokenizer';

describe('Tokenizer', () => {
  it('should load from JSON', () => {
    const tokenizer = new Tokenizer();
    tokenizer.wordIndex = {
      hello: 1,
      world: 2,
    };
    tokenizer.indexWord = {
      1: 'hello',
      2: 'world',
    };
    const recreated_tokenizer = tokenizerFromJson(tokenizer.toJson());
    expect(recreated_tokenizer.wordIndex).toEqual(tokenizer.wordIndex);
    expect(recreated_tokenizer.indexWord).toEqual(tokenizer.indexWord);
  });

  it('should respect the lower flag', () => {
    const texts = ['hello hello Hello'];

    // Test the default behavior: lowercasing is on.
    let tokenizer = new Tokenizer();
    tokenizer.fitOnTexts(texts);
    expect(tokenizer.wordIndex).toEqual({ hello: 1 });

    // With lowercasing off, differently-cased words get separate indices.
    tokenizer = new Tokenizer({ lower: false });
    tokenizer.fitOnTexts(texts);
    expect(tokenizer.wordIndex).toEqual({ hello: 1, Hello: 2 });
  });

  it('should tokenize texts and store metadata for the texts', () => {
    const tokenizer = new Tokenizer();
    const texts = [
      'hello hello .,/#!$%^&*;:{}= \\ -_`~() hello Hello world world world',
      'great success .,/#!$%^&*;:{}=\\-_`~() Success'
    ];
    tokenizer.fitOnTexts(texts);
    const sequences = tokenizer.textsToSequences(texts);

    // Words are indexed by descending frequency, starting at 1.
    expect(tokenizer.wordIndex).toEqual({
      hello: 1,
      world: 2,
      success: 3,
      great: 4
    });
    expect(tokenizer.indexWord).toEqual({
      1: 'hello',
      2: 'world',
      3: 'success',
      4: 'great'
    });
    expect(tokenizer.wordCounts).toEqual({
      hello: 4,
      world: 3,
      success: 2,
      great: 1
    });
    // Filtered punctuation disappears from the sequences entirely.
    expect(sequences).toEqual([
      [1, 1, 1, 1, 2, 2, 2],
      [4, 3, 3]
    ]);
  });
});
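
The tests use Jest-style globals (describe/it/expect), so they should run under a standard Jest setup, e.g. npx jest, assuming the code is saved as tokenizer.js next to a matching test file (the file names are an assumption; the gist only shows the code).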