Every compiler book will tell you to use a lexer generator. Flex, re2c, ANTLR — they’re battle-tested, they handle Unicode edge cases you’ve never thought about, and they generate code that’s probably faster than what you’d write by hand.
I still think you should write one from scratch at least once.
Not because you’ll use it in production — you won’t. But because the moment you understand how a lexer actually works at the character level, a lot of other things click: why some tokenisation decisions are arbitrary, why error messages in real compilers are the way they are, why certain syntax rules exist purely to make parsing unambiguous.
Here’s a minimal one.
## The setup
We’re lexing a made-up language I’ll call Petal. It has:
- Integer and float literals
- String literals (double-quoted, `\"` escapes)
- Identifiers and keywords (`if`, `else`, `fn`, `let`, `return`)
- Operators: `+ - * / = == != < > <= >=`
- Delimiters: `( ) { } [ ] , ; :`
- Single-line comments: `// ...`
- Whitespace (ignored)
The token type enum:
```c
typedef enum {
    TOK_INT, TOK_FLOAT, TOK_STRING, TOK_IDENT,
    TOK_PLUS, TOK_MINUS, TOK_STAR, TOK_SLASH,
    TOK_EQ, TOK_EQEQ, TOK_NEQ, TOK_LT, TOK_GT, TOK_LTE, TOK_GTE,
    TOK_LPAREN, TOK_RPAREN, TOK_LBRACE, TOK_RBRACE,
    TOK_LBRACKET, TOK_RBRACKET,
    TOK_COMMA, TOK_SEMI, TOK_COLON,
    TOK_KW_IF, TOK_KW_ELSE, TOK_KW_FN, TOK_KW_LET, TOK_KW_RETURN,
    TOK_EOF, TOK_ERROR,
} TokenKind;

typedef struct {
    TokenKind kind;
    const char *start; // pointer into source
    int len;           // byte length
    int line;
} Token;
```
Storing a pointer into the source plus a byte length avoids allocation — every token is just a view into the original source string. Zig's `std.zig.Tokenizer` takes the same approach with byte offsets, and Go's `token.Pos` is a compact integer offset in the same spirit.
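Because a token is just a slice of the source, printing one never needs a copy. A minimal sketch (the helper is mine, not from the repo) using `printf`'s `%.*s`, which takes an explicit length:

```c
#include <stdio.h>

// Print a token's text straight out of the source buffer.
// %.*s takes a length argument, so no NUL-terminated copy is needed.
static void print_token(Token t) {
    printf("line %d: kind=%d text=\"%.*s\"\n", t.line, (int)t.kind, t.len, t.start);
}
```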
## The lexer state
```c
typedef struct {
    const char *src;        // full source
    const char *cur;        // current position
    const char *line_start; // start of the current line
    int line;
} Lexer;

void lexer_init(Lexer *l, const char *src) {
    l->src = src;
    l->cur = src;
    l->line_start = src;
    l->line = 1;
}

static char peek(Lexer *l)    { return *l->cur; }
static char peek2(Lexer *l)   { return l->cur[1]; } // only valid when !at_end(l)
static char advance(Lexer *l) { return *l->cur++; }
static int  at_end(Lexer *l)  { return *l->cur == '\0'; }
```
Keeping `line_start` lets us compute column numbers cheaply: `col = cur - line_start + 1`.
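The scan loop below also calls three helpers the listing doesn't show: `make_tok`, `match`, and `is_alpha`. Here are plausible definitions matching how they're used — the originals may differ in detail:

```c
// Build a token as a view of [start, start+len); line is the lexer's current line.
static Token make_tok(Lexer *l, TokenKind kind, const char *start, int len) {
    Token t = { kind, start, len, l->line };
    return t;
}

// Consume the next character only if it equals `expected` (for ==, <=, ...).
static int match(Lexer *l, char expected) {
    if (at_end(l) || peek(l) != expected) return 0;
    advance(l);
    return 1;
}

// ASCII letters only — so Petal identifiers can't start with an underscore.
static int is_alpha(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
```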
## The main scan loop
```c
Token lexer_next(Lexer *l) {
    // Skip whitespace and comments
    for (;;) {
        while (!at_end(l) && (peek(l) == ' ' || peek(l) == '\t' || peek(l) == '\r'))
            advance(l);
        if (peek(l) == '\n') { l->line++; l->line_start = l->cur + 1; advance(l); continue; }
        if (peek(l) == '/' && peek2(l) == '/') {
            while (!at_end(l) && peek(l) != '\n') advance(l);
            continue;
        }
        break;
    }

    if (at_end(l)) return make_tok(l, TOK_EOF, l->cur, 0);

    const char *start = l->cur;
    char c = advance(l);

    // Single-character tokens
    switch (c) {
        case '(': return make_tok(l, TOK_LPAREN,   start, 1);
        case ')': return make_tok(l, TOK_RPAREN,   start, 1);
        case '{': return make_tok(l, TOK_LBRACE,   start, 1);
        case '}': return make_tok(l, TOK_RBRACE,   start, 1);
        case '[': return make_tok(l, TOK_LBRACKET, start, 1);
        case ']': return make_tok(l, TOK_RBRACKET, start, 1);
        case ',': return make_tok(l, TOK_COMMA,    start, 1);
        case ';': return make_tok(l, TOK_SEMI,     start, 1);
        case ':': return make_tok(l, TOK_COLON,    start, 1);
        case '+': return make_tok(l, TOK_PLUS,     start, 1);
        case '-': return make_tok(l, TOK_MINUS,    start, 1);
        case '*': return make_tok(l, TOK_STAR,     start, 1);
        case '/': return make_tok(l, TOK_SLASH,    start, 1); // comments already skipped
    }

    // One-or-two character tokens
    if (c == '=') return match(l, '=') ? make_tok(l, TOK_EQEQ, start, 2)
                                       : make_tok(l, TOK_EQ,   start, 1);
    if (c == '!') return match(l, '=') ? make_tok(l, TOK_NEQ,  start, 2)
                                       : make_tok(l, TOK_ERROR, start, 1); // bare '!' isn't a Petal operator
    if (c == '<') return match(l, '=') ? make_tok(l, TOK_LTE, start, 2)
                                       : make_tok(l, TOK_LT,  start, 1);
    if (c == '>') return match(l, '=') ? make_tok(l, TOK_GTE, start, 2)
                                       : make_tok(l, TOK_GT,  start, 1);

    // Strings
    if (c == '"') return lex_string(l, start);

    // Numbers
    if (c >= '0' && c <= '9') return lex_number(l, start);

    // Identifiers and keywords
    if (is_alpha(c)) return lex_ident(l, start);

    return make_tok(l, TOK_ERROR, start, 1);
}
```
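Wiring it together, a throwaway driver might look like this (the Petal snippet is just sample input; assumes the code above plus `<stdio.h>`):

```c
int main(void) {
    const char *src = "let x = 3.14; // pi-ish\nif (x >= 3) { return x; }";
    Lexer l;
    lexer_init(&l, src);
    for (;;) {
        Token t = lexer_next(&l);
        if (t.kind == TOK_EOF) break;
        printf("line %d: kind=%d '%.*s'\n", t.line, (int)t.kind, t.len, t.start);
    }
    return 0;
}
```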
## Numbers and strings
```c
static Token lex_number(Lexer *l, const char *start) {
    while (peek(l) >= '0' && peek(l) <= '9') advance(l);

    // A '.' only makes this a float if a digit follows, so "1.foo"
    // lexes as an int and a colon-free '.' rather than a broken float.
    if (peek(l) == '.' && peek2(l) >= '0' && peek2(l) <= '9') {
        advance(l); // consume '.'
        while (peek(l) >= '0' && peek(l) <= '9') advance(l);
        return make_tok(l, TOK_FLOAT, start, (int)(l->cur - start));
    }
    return make_tok(l, TOK_INT, start, (int)(l->cur - start));
}
```
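Since the lexeme is a view rather than a NUL-terminated string, turning a number token into a value takes a small copy before calling `strtod`. A hedged sketch (the helper name is mine):

```c
#include <stdlib.h>
#include <string.h>

// Convert a TOK_INT or TOK_FLOAT lexeme to a double. strtod wants NUL
// termination, and the token is only a view, so copy to a small buffer first.
static double token_to_double(Token t) {
    char buf[64];
    int n = t.len < 63 ? t.len : 63; // Petal numerals comfortably fit
    memcpy(buf, t.start, n);
    buf[n] = '\0';
    return strtod(buf, NULL);
}
```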
```c
static Token lex_string(Lexer *l, const char *start) {
    while (!at_end(l) && peek(l) != '"') {
        // Skip the backslash of an escape — unless it's the last byte of
        // the source, or the advance below would walk past the NUL.
        if (peek(l) == '\\' && peek2(l) != '\0') advance(l);
        if (peek(l) == '\n') { l->line++; l->line_start = l->cur + 1; }
        advance(l);
    }
    if (at_end(l)) return make_tok(l, TOK_ERROR, start, (int)(l->cur - start)); // unterminated
    advance(l); // closing "
    return make_tok(l, TOK_STRING, start, (int)(l->cur - start));
}
```
The string lexer stores the raw source bytes including quotes. The parser’s job is to unescape them when building the AST — that separation keeps the lexer simple.
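For concreteness, the parser-side unescaping might look like this — a minimal sketch handling only `\n`, `\t`, `\\`, and `\"` (the function name and the set of escapes are my choices, not Petal's spec):

```c
#include <stdlib.h>

// Unescape a well-formed TOK_STRING lexeme (quotes included) into a fresh
// heap buffer. Caller frees. Unknown escapes pass through verbatim.
static char *unescape_string(Token t) {
    char *out = malloc(t.len + 1); // unescaping never grows the text
    char *w = out;
    for (int i = 1; i < t.len - 1; i++) { // skip the surrounding quotes
        char c = t.start[i];
        if (c == '\\' && i + 1 < t.len - 1) {
            switch (t.start[++i]) {
                case 'n':  *w++ = '\n'; break;
                case 't':  *w++ = '\t'; break;
                case '"':  *w++ = '"';  break;
                case '\\': *w++ = '\\'; break;
                default:   *w++ = t.start[i]; break;
            }
        } else {
            *w++ = c;
        }
    }
    *w = '\0';
    return out;
}
```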
## Keyword lookup

Rather than a hash map, I use a small sorted array of `{string, length, TokenKind}` entries. With five keywords a linear scan is plenty fast and zero-allocation, and keeping the table sorted leaves binary search as a drop-in upgrade (sketched after the code):
```c
static const struct { const char *kw; int len; TokenKind kind; } keywords[] = {
    {"else",   4, TOK_KW_ELSE},
    {"fn",     2, TOK_KW_FN},
    {"if",     2, TOK_KW_IF},
    {"let",    3, TOK_KW_LET},
    {"return", 6, TOK_KW_RETURN},
};

#define NUM_KEYWORDS (int)(sizeof keywords / sizeof keywords[0])

static Token lex_ident(Lexer *l, const char *start) {
    while (is_alpha(peek(l)) || (peek(l) >= '0' && peek(l) <= '9') || peek(l) == '_')
        advance(l);
    int len = (int)(l->cur - start);
    // Linear scan is fine for 5 keywords
    for (int i = 0; i < NUM_KEYWORDS; i++) {
        if (keywords[i].len == len && memcmp(keywords[i].kw, start, len) == 0)
            return make_tok(l, keywords[i].kind, start, len);
    }
    return make_tok(l, TOK_IDENT, start, len);
}
```
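Because the table is sorted, switching to `bsearch` from `<stdlib.h>` is a drop-in change once the keyword list grows. A sketch, assuming the table's element type is given a name (`Keyword`) so the comparator can use it:

```c
#include <stdlib.h>
#include <string.h>

typedef struct { const char *kw; int len; TokenKind kind; } Keyword;
// ...with keywords[] above declared as: static const Keyword keywords[] = { ... };

// Lexicographic comparison of a (pointer, length) key against a table entry.
static int kw_cmp(const void *a, const void *b) {
    const Keyword *key = a, *elem = b;
    int min = key->len < elem->len ? key->len : elem->len;
    int c = memcmp(key->kw, elem->kw, min);
    return c ? c : key->len - elem->len;
}

static TokenKind keyword_kind(const char *start, int len) {
    Keyword key = { start, len, TOK_IDENT };
    const Keyword *hit = bsearch(&key, keywords, NUM_KEYWORDS,
                                 sizeof keywords[0], kw_cmp);
    return hit ? hit->kind : TOK_IDENT;
}
```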
## What I’d do differently for a real language
- Unicode identifiers: `is_alpha` above only handles ASCII. For real support you need to decode UTF-8 code points and check Unicode categories — `std.unicode.isAlphabetic` in Zig, `unicode.IsLetter` in Go.
- Error recovery: instead of returning `TOK_ERROR` and stopping, a production lexer skips to the next synchronisation point (usually a semicolon or newline) and keeps going. Better error messages require reporting multiple errors per file, not just the first one.
- Indentation-sensitive syntax: if your language is Python-like, the lexer needs to track indentation and emit indent/dedent as virtual tokens. This is the most painful part of lexing Python.
- String interning: for large programs with many repeated identifiers, comparing identifier bytes over and over is slow. Use an intern table so every occurrence of `foo` shares the same pointer — identifier comparison becomes an O(1) pointer check. A toy sketch follows this list.
The full source (about 280 lines including the test harness) is in the repo.