# tokenizer

Tokenizer utils for NLP.
## Install

```sh
$ npm install nlp-tokenizer
```

## Basic Tokenizer
```js
import { Tokenizer, NNTokenizer, WordPieceTokenizer } from 'nlp-tokenizer';
// or
const { Tokenizer, NNTokenizer, WordPieceTokenizer } = require('nlp-tokenizer');

let tokenizer = new Tokenizer();

let tokens = tokenizer.tokenize('one two three, four, five.');
// tokens == ['one', 'two', 'three', ',', 'four', ',', 'five', '.']

let text = '...'; // any input text
let result = tokenizer.stats(text); // [unique words, word count, lexical diversity] (method name may differ)
// 6 unique words, 9 words, lexical diversity is 6/9
// result == [6, 9, 6/9]
```
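For intuition, the behaviour above can be reproduced in a few lines of plain JavaScript: split words and punctuation apart, then count unique versus total words. This is only an illustrative sketch; `basicTokenize` and `lexicalStats` are hypothetical helpers, not part of the package.

```js
// Illustrative sketch, not the package's implementation:
// words and punctuation become separate tokens, and lexical diversity
// is the ratio of unique words to total words.
function basicTokenize(text) {
  return text.match(/\w+|[^\w\s]/g) || []; // word runs or single punctuation marks
}

function lexicalStats(words) {
  const unique = new Set(words).size;
  return [unique, words.length, unique / words.length];
}

basicTokenize('one two three, four, five.');
// ['one', 'two', 'three', ',', 'four', ',', 'five', '.']
lexicalStats(['one', 'two', 'one', 'three', 'two', 'one', 'four', 'five', 'six']);
// 6 unique words out of 9 -> [6, 9, 6/9]
```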
## Word Piece Tokenizer
```js
import { Tokenizer, NNTokenizer, WordPieceTokenizer } from 'nlp-tokenizer';
// or
const { Tokenizer, NNTokenizer, WordPieceTokenizer } = require('nlp-tokenizer');

let token_to_id = { 'one': 1, 'two': 2, 'un': 3, 'able': 4, 'aff': 5 };
let unknown_token = '[UNK]';
let tokenizer = new WordPieceTokenizer(token_to_id, unknown_token);

let t = tokenizer.tokenize('unaffable');
// t == ['un', '##aff', '##able']
```
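WordPiece greedily takes the longest prefix of the word that appears in the vocabulary, then repeats on the remainder, marking non-initial pieces with `##`. A minimal sketch of that longest-match-first idea (illustrative only; `wordPiece` is a hypothetical helper, not the package's internal code):

```js
// Greedy longest-match-first WordPiece split (illustrative sketch).
function wordPiece(word, vocab, unknownToken) {
  const pieces = [];
  let start = 0;
  while (start < word.length) {
    let end = word.length;
    let piece = null;
    while (end > start) { // try the longest remaining substring first
      const sub = word.slice(start, end);
      if (sub in vocab) {
        piece = start === 0 ? sub : '##' + sub; // non-initial pieces get the '##' marker
        break;
      }
      end--;
    }
    if (piece === null) return [unknownToken]; // nothing matched: the whole word is unknown
    pieces.push(piece);
    start = end;
  }
  return pieces;
}

wordPiece('unaffable', { 'one': 1, 'two': 2, 'un': 3, 'able': 4, 'aff': 5 }, '[UNK]');
// ['un', '##aff', '##able']
```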
## NN Tokenizer
```js
import { Tokenizer, NNTokenizer, WordPieceTokenizer } from 'nlp-tokenizer';
// or
const { Tokenizer, NNTokenizer, WordPieceTokenizer } = require('nlp-tokenizer');

let vocabulary = ['one', 'two', 'three', 'four', 'five', ',', '.'];
let extra_vocabulary = ['foo', 'bar'];

let tokenizer = new NNTokenizer(vocabulary);
tokenizer.addVocabulary(extra_vocabulary); // register extra tokens after the base vocabulary (method name may differ)

let tokens = tokenizer.tokenize('one two three, four foo, five.');
// tokens == ['one', 'two', 'three', ',', 'four', 'foo', ',', 'five', '.']

let encoded = tokenizer.encode('one two three, four foo baz, five.'); // 'baz' stands in for any out-of-vocabulary word
// vocabulary ranges from 0 to 6
// unknown token takes 7
// extra vocabulary ranges from 8 to 9
// encoded == [0, 1, 2, 5, 3, 8, 7, 5, 4, 6]
```
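The id layout in the comments above (base vocabulary first, one id reserved for unknown tokens, extra vocabulary appended afterwards) can be made concrete with a small sketch; `buildIds` and `encode` below are hypothetical helpers rather than the package API.

```js
// Illustrative id layout: base vocabulary -> 0..n-1, unknown -> n, extra vocabulary -> n+1 onwards.
function buildIds(vocabulary, extraVocabulary) {
  const ids = new Map();
  vocabulary.forEach((tok, i) => ids.set(tok, i));                      // 0..6
  const unknownId = vocabulary.length;                                  // 7
  extraVocabulary.forEach((tok, i) => ids.set(tok, unknownId + 1 + i)); // 8..9
  return { ids, unknownId };
}

function encode(tokens, { ids, unknownId }) {
  return tokens.map(tok => (ids.has(tok) ? ids.get(tok) : unknownId));
}

const table = buildIds(['one', 'two', 'three', 'four', 'five', ',', '.'], ['foo', 'bar']);
encode(['one', 'two', 'three', ',', 'four', 'foo', 'baz', ',', 'five', '.'], table);
// [0, 1, 2, 5, 3, 8, 7, 5, 4, 6]   ('baz' is out of vocabulary, so it gets id 7)
```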