This is a basic lexer written in JavaScript.
yarn add little-lexer
import Lexer from 'little-lexer';
const lex = Lexer();
const result = lex('if obj.prop = 2.25 then var1');
By default, every character in the string is lexed to a separate token, so the result contains one token for every character in the string.
[
{ type: 'i', lexeme: 'i' },
{ type: 'f', lexeme: 'f' },
{ type: ' ', lexeme: ' ' },
{ type: 'o', lexeme: 'o' },
{ type: 'b', lexeme: 'b' },
{ type: 'j', lexeme: 'j' },
{ type: '.', lexeme: '.' },
{ type: 'p', lexeme: 'p' },
{ type: 'r', lexeme: 'r' },
{ type: 'o', lexeme: 'o' },
{ type: 'p', lexeme: 'p' },
{ type: ' ', lexeme: ' ' },
{ type: '=', lexeme: '=' },
{ type: ' ', lexeme: ' ' },
{ type: '2', lexeme: '2' },
{ type: '.', lexeme: '.' },
{ type: '2', lexeme: '2' },
{ type: '5', lexeme: '5' },
{ type: ' ', lexeme: ' ' },
{ type: 't', lexeme: 't' },
{ type: 'h', lexeme: 'h' },
{ type: 'e', lexeme: 'e' },
{ type: 'n', lexeme: 'n' },
{ type: ' ', lexeme: ' ' },
{ type: 'v', lexeme: 'v' },
{ type: 'a', lexeme: 'a' },
{ type: 'r', lexeme: 'r' },
{ type: '1', lexeme: '1' },
];
An optional object can be passed to the Lexer to specify your own token types.
import Lexer from 'little-lexer';
const matches = {
' ': 'space',
'=': 'equals',
};
const lex = Lexer(matches);
const result = lex('if obj.prop = 2.25 then var1');
/* [
{ type: 'i', lexeme: 'i' },
{ type: 'f', lexeme: 'f' },
{ type: 'space', lexeme: ' ' },
{ type: 'o', lexeme: 'o' },
{ type: 'b', lexeme: 'b' },
{ type: 'j', lexeme: 'j' },
{ type: '.', lexeme: '.' },
{ type: 'p', lexeme: 'p' },
{ type: 'r', lexeme: 'r' },
{ type: 'o', lexeme: 'o' },
{ type: 'p', lexeme: 'p' },
{ type: 'space', lexeme: ' ' },
{ type: 'equals', lexeme: '=' },
{ type: 'space', lexeme: ' ' },
{ type: '2', lexeme: '2' },
{ type: '.', lexeme: '.' },
{ type: '2', lexeme: '2' },
{ type: '5', lexeme: '5' },
{ type: 'space', lexeme: ' ' },
{ type: 't', lexeme: 't' },
{ type: 'h', lexeme: 'h' },
{ type: 'e', lexeme: 'e' },
{ type: 'n', lexeme: 'n' },
{ type: 'space', lexeme: ' ' },
{ type: 'v', lexeme: 'v' },
{ type: 'a', lexeme: 'a' },
{ type: 'r', lexeme: 'r' },
{ type: '1', lexeme: '1' },
]; */
The keys can also match multiple characters:
import Lexer from 'little-lexer';
const matches = {
if: 'keyword',
then: 'keyword',
' ': 'space',
'=': 'equals',
};
const lex = Lexer(matches);
const result = lex('if obj.prop = 2.25 then var1');
/*
[
{ type: 'keyword', lexeme: 'if' },
{ type: 'space', lexeme: ' ' },
{ type: 'o', lexeme: 'o' },
{ type: 'b', lexeme: 'b' },
{ type: 'j', lexeme: 'j' },
{ type: '.', lexeme: '.' },
{ type: 'p', lexeme: 'p' },
{ type: 'r', lexeme: 'r' },
{ type: 'o', lexeme: 'o' },
{ type: 'p', lexeme: 'p' },
{ type: 'space', lexeme: ' ' },
{ type: 'equals', lexeme: '=' },
{ type: 'space', lexeme: ' ' },
{ type: '2', lexeme: '2' },
{ type: '.', lexeme: '.' },
{ type: '2', lexeme: '2' },
{ type: '5', lexeme: '5' },
{ type: 'space', lexeme: ' ' },
{ type: 'keyword', lexeme: 'then' },
{ type: 'space', lexeme: ' ' },
{ type: 'v', lexeme: 'v' },
{ type: 'a', lexeme: 'a' },
{ type: 'r', lexeme: 'r' },
{ type: '1', lexeme: '1' }
];
*/
The keys are used as regular expressions. So you can use regexes to specify more complex matches.
import Lexer from 'little-lexer';
const matches = {
if: 'keyword',
then: 'keyword',
' ': 'space',
'=': 'equals',
'[0-9]': 'number',
};
const lex = Lexer(matches);
const result = lex('if obj.prop = 2.25 then var1');
/*
[
{ type: 'keyword', lexeme: 'if' },
{ type: 'space', lexeme: ' ' },
{ type: 'o', lexeme: 'o' },
{ type: 'b', lexeme: 'b' },
{ type: 'j', lexeme: 'j' },
{ type: '.', lexeme: '.' },
{ type: 'p', lexeme: 'p' },
{ type: 'r', lexeme: 'r' },
{ type: 'o', lexeme: 'o' },
{ type: 'p', lexeme: 'p' },
{ type: 'space', lexeme: ' ' },
{ type: 'equals', lexeme: '=' },
{ type: 'space', lexeme: ' ' },
{ type: 'number', lexeme: '2' },
{ type: '.', lexeme: '.' },
{ type: 'number', lexeme: '2' },
{ type: 'number', lexeme: '5' },
{ type: 'space', lexeme: ' ' },
{ type: 'keyword', lexeme: 'then' },
{ type: 'space', lexeme: ' ' },
{ type: 'v', lexeme: 'v' },
{ type: 'a', lexeme: 'a' },
{ type: 'r', lexeme: 'r' },
{ type: 'number', lexeme: '1' }
];
*/
The keys being regexes also means that you have to escape characters with a special regex meaning. Not escaping the . character in the next example would not yield the result we want, as the . represents any possible character in a regex. So it would always result in a match for any character instead of matching just the . character.
import Lexer from 'little-lexer';
const matches = {
if: 'keyword',
then: 'keyword',
' ': 'space',
'=': 'equals',
'\\.': 'dot',
'[0-9]': 'number',
};
const lex = Lexer(matches);
const result = lex('if obj.prop = 2.25 then var1');
/*
[
{ type: 'keyword', lexeme: 'if' },
{ type: 'space', lexeme: ' ' },
{ type: 'o', lexeme: 'o' },
{ type: 'b', lexeme: 'b' },
{ type: 'j', lexeme: 'j' },
{ type: 'dot', lexeme: '.' },
{ type: 'p', lexeme: 'p' },
{ type: 'r', lexeme: 'r' },
{ type: 'o', lexeme: 'o' },
{ type: 'p', lexeme: 'p' },
{ type: 'space', lexeme: ' ' },
{ type: 'equals', lexeme: '=' },
{ type: 'space', lexeme: ' ' },
{ type: 'number', lexeme: '2' },
{ type: 'dot', lexeme: '.' },
{ type: 'number', lexeme: '2' },
{ type: 'number', lexeme: '5' },
{ type: 'space', lexeme: ' ' },
{ type: 'keyword', lexeme: 'then' },
{ type: 'space', lexeme: ' ' },
{ type: 'v', lexeme: 'v' },
{ type: 'a', lexeme: 'a' },
{ type: 'r', lexeme: 'r' },
{ type: 'number', lexeme: '1' }
];
*/
Sometimes the lexer needs to have different states. An optional object can be passed to the Lexer function as a second argument to specify other states.
If, for example, we encounter a digit and would like the . character to be lexed as part of the floating-point number rather than as a separate 'dot' token, we should specify a new state. So we introduce a new "number" state, also with a set of matches to specify state transitions. While in a custom state, the lexer only uses the matches specified in that state. In the next example, if the lexer encounters a digit [0-9] it transitions to the "number" state. While in the number state, only [0-9\\.] is tested and, when matched, keeps the lexer in the number state. If no match is found, the lexer transitions back to the starting state and a token is generated.
import Lexer from 'little-lexer';
const matches = {
if: 'keyword',
then: 'keyword',
' ': 'space',
'=': 'equals',
'\\.': 'dot',
'[0-9]': 'number',
};
const states = {
number: {
'[0-9\\.]': 'number',
},
};
const lex = Lexer(matches, states);
const result = lex('if obj.prop = 2.25 then var1');
/*
[
{ type: 'keyword', lexeme: 'if' },
{ type: 'space', lexeme: ' ' },
{ type: 'o', lexeme: 'o' },
{ type: 'b', lexeme: 'b' },
{ type: 'j', lexeme: 'j' },
{ type: 'dot', lexeme: '.' },
{ type: 'p', lexeme: 'p' },
{ type: 'r', lexeme: 'r' },
{ type: 'o', lexeme: 'o' },
{ type: 'p', lexeme: 'p' },
{ type: 'space', lexeme: ' ' },
{ type: 'equals', lexeme: '=' },
{ type: 'space', lexeme: ' ' },
{ type: 'number', lexeme: '2.25' },
{ type: 'space', lexeme: ' ' },
{ type: 'keyword', lexeme: 'then' },
{ type: 'space', lexeme: ' ' },
{ type: 'v', lexeme: 'v' },
{ type: 'a', lexeme: 'a' },
{ type: 'r', lexeme: 'r' },
{ type: 'number', lexeme: '1' }
];
*/
If we want to allow lexing of identifiers that contain, but do not start with, numbers and capital letters, we can again introduce a new state for lexing identifiers. Only a lowercase [a-z] transitions to the "identifier" state. When in the "identifier" state, [a-zA-Z0-9] keeps the lexer in the "identifier" state.
import Lexer from 'little-lexer';
const matches = {
if: 'keyword',
then: 'keyword',
' ': 'space',
'=': 'equals',
'\\.': 'dot',
'[a-z]': 'identifier',
'[0-9]': 'number',
};
const states = {
number: {
'[0-9\\.]': 'number',
},
identifier: {
'[a-zA-Z0-9]': 'identifier',
},
};
const lex = Lexer(matches, states);
const result = lex('if obj.prop = 2.25 then var1');
/*
[
{ type: 'keyword', lexeme: 'if' },
{ type: 'space', lexeme: ' ' },
{ type: 'identifier', lexeme: 'obj' },
{ type: 'dot', lexeme: '.' },
{ type: 'identifier', lexeme: 'prop' },
{ type: 'space', lexeme: ' ' },
{ type: 'equals', lexeme: '=' },
{ type: 'space', lexeme: ' ' },
{ type: 'number', lexeme: '2.25' },
{ type: 'space', lexeme: ' ' },
{ type: 'keyword', lexeme: 'then' },
{ type: 'space', lexeme: ' ' },
{ type: 'identifier', lexeme: 'var1' }
];
*/