You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

411 lines
10 KiB

// Copyright (c) 2015 The golex Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package lex // import ""
import (
	"bytes"
	"fmt"
	"go/token"
	"io"
	"os"
)
// BOM handling modes which can be set by the BOMMode Option. The default mode
// is BOMIgnoreFirst.
const (
	BOMError       = iota // BOM is an error anywhere.
	BOMIgnoreFirst        // Skip BOM if at beginning, report as error if anywhere else.
	BOMPassAll            // No special handling of BOM.
	BOMPassFirst          // No special handling of BOM if at beginning, report as error if anywhere else.
)
const (
	NonASCII = 0x80 // DefaultRuneClass returns NonASCII for non ASCII runes.
	RuneEOF  = -1   // Distinct from any valid Unicode rune value.
)

// DefaultRuneClass returns the character class of r. If r is an ASCII code
// then its class equals the ASCII code. Any other rune, including negative
// values such as RuneEOF, is of class NonASCII.
//
// DefaultRuneClass is the default implementation Lexer will use to convert
// runes (21 bit entities) to scanner classes (8 bit entities).
//
// Non ASCII aware lexical analyzers will typically use their own
// categorization function. To assign such custom function use the RuneClass
// option.
func DefaultRuneClass(r rune) int {
	if r >= 0 && r < 0x80 {
		return int(r)
	}

	return NonASCII
}
// Char represents a rune and its position.
type Char struct {
	Rune rune
	pos  int32 // token.Pos narrowed to 32 bits.
}

// NewChar returns a new Char value holding rune r at position pos. The
// position is stored as an int32.
func NewChar(pos token.Pos, r rune) Char { return Char{pos: int32(pos), Rune: r} }

// IsValid reports whether c has a valid position. The zero Char is not valid;
// neither is any Char created with an invalid position, whatever its Rune.
func (c Char) IsValid() bool { return c.Pos().IsValid() }

// Pos returns the token.Pos associated with c.
func (c Char) Pos() token.Pos { return token.Pos(c.pos) }
// CharReader is a RuneReader providing additionally explicit position
// information by returning a Char instead of a rune as its first result.
type CharReader interface {
ReadChar() (c Char, size int, err error)
// Lexer suports golex[0] generated lexical analyzers.
type Lexer struct {
File *token.File // The *token.File passed to New.
First Char // First remembers the lookahead char when Rule0 was invoked.
Last Char // Last remembers the last Char returned by Next.
Prev Char // Prev remembers the Char previous to Last.
bomMode int // See the BOM* constants.
bytesBuf bytes.Buffer // Used by TokenBytes.
charSrc CharReader // Lexer alternative input.
classf func(rune) int //
errorf func(token.Pos, string) //
lookahead Char // Lookahead if non zero.
mark int // Longest match marker.
off int // Used for File.AddLine.
src io.RuneReader // Lexer input.
tokenBuf []Char // Lexeme collector.
ungetBuf []Char // Unget buffer.
// New returns a new *Lexer. The result can be amended using opts.
// Non Unicode Input
// To consume sources in other encodings and still have exact position
// information, pass an io.RuneReader which returns the next input character
// reencoded as an Unicode rune but returns the size (number of bytes used to
// encode it) of the original character, not the size of its UTF-8
// representation after converted to an Unicode rune. Size is the second
// returned value of io.RuneReader.ReadRune method[4].
// When src optionally implements CharReader its ReadChar method is used
// instead of io.ReadRune.
func New(file *token.File, src io.RuneReader, opts ...Option) (*Lexer, error) {
r := &Lexer{
File: file,
bomMode: BOMIgnoreFirst,
classf: DefaultRuneClass,
src: src,
if x, ok := src.(CharReader); ok {
r.charSrc = x
r.errorf = r.defaultErrorf
for _, o := range opts {
if err := o(r); err != nil {
return nil, err
return r, nil
// Abort handles the situation when the scanner does not successfully recognize
// any token or when an attempt to find the longest match "overruns" from an
// accepting state only to never reach an accepting state again. In the first
// case the scanner was never in an accepting state since last call to Rule0
// and then (true, previousLookahead rune) is returned, effectively consuming a
// single Char token, avoiding scanner stall. Otherwise there was at least one
// accepting scanner state marked using Mark. In this case Abort rollbacks the
// lexer state to the marked state and returns (false, 0). The scanner must
// then execute a prescribed goto statement. For example:
// %yyc c
// %yyn c = l.Next()
// %yym l.Mark()
// %{
// package foo
// import (...)
// type lexer struct {
// *lex.Lexer
// ...
// }
// func newLexer(...) *lexer {
// return &lexer{
// lex.NewLexer(...),
// ...
// }
// }
// func (l *lexer) scan() int {
// c := l.Enter()
// %}
// ... more lex defintions
// %%
// c = l.Rule0()
// ... lex rules
// %%
// if c, ok := l.Abort(); ok {
// return c
// }
// goto yyAction
// }
func (l *Lexer) Abort() (int, bool) {
if l.mark >= 0 {
if len(l.tokenBuf) > l.mark {
for i := len(l.tokenBuf) - 1; i >= l.mark; i-- {
l.tokenBuf = l.tokenBuf[:l.mark]
return 0, false
switch n := len(l.tokenBuf); n {
case 0: // [] z
c := l.lookahead
return int(c.Rune), true
case 1: // [a] z
return int(l.tokenBuf[0].Rune), true
default: // [a, b, ...], z
c := l.tokenBuf[0] // a
l.Unget(l.lookahead) // z
for i := n - 1; i > 1; i-- {
l.Unget(l.tokenBuf[i]) // ...
l.lookahead = l.tokenBuf[1] // b
l.tokenBuf = l.tokenBuf[:1]
return int(c.Rune), true
// class returns the character class of the current lookahead rune as computed
// by the lexer's classification function.
func (l *Lexer) class() int {
	r := l.lookahead.Rune
	return l.classf(r)
}
func (l *Lexer) defaultErrorf(pos token.Pos, msg string) {
l.Error(fmt.Sprintf("%v: %v", l.File.Position(pos), msg))
// Enter ensures the lexer has a valid lookahead Char and returns its class.
// Typical use in an .l file
// func (l *lexer) scan() lex.Char {
// c := l.Enter()
// ...
func (l *Lexer) Enter() int {
if !l.lookahead.IsValid() {
return l.class()
// Error Implements yyLexer[2] by printing the msg to stderr.
func (l *Lexer) Error(msg string) {
fmt.Fprintf(os.Stderr, "%s\n", msg)
// Lookahead returns the current lookahead.
func (l *Lexer) Lookahead() Char {
if !l.lookahead.IsValid() {
return l.lookahead
// Mark records the current state of scanner as accepting by remembering how
// many chars the token buffer holds at this point. It implements the golex
// macro %yym. Typical usage in an .l file:
//
//	%yym l.Mark()
func (l *Lexer) Mark() {
	collected := len(l.tokenBuf)
	l.mark = collected
}
func (l *Lexer) next() int {
const bom = '\ufeff'
if c := l.lookahead; c.IsValid() {
l.tokenBuf = append(l.tokenBuf, c)
if n := len(l.ungetBuf); n != 0 {
l.lookahead = l.ungetBuf[n-1]
l.ungetBuf = l.ungetBuf[:n-1]
return l.class()
if l.src == nil {
return RuneEOF
var r rune
var sz int
var err error
var pos token.Pos
var c Char
off0 :=
switch cs := l.charSrc; {
case cs != nil:
c, sz, err = cs.ReadChar()
r = c.Rune
pos = c.Pos()
r, sz, err = l.src.ReadRune()
pos = l.File.Pos(
} += sz
if err != nil {
l.src = nil
r = RuneEOF
if err != io.EOF {
l.errorf(pos, err.Error())
if r == bom {
switch l.bomMode {
case BOMIgnoreFirst:
if off0 != 0 {
l.errorf(pos, "unicode (UTF-8) BOM in middle of file")
goto again
case BOMPassAll:
// nop
case BOMPassFirst:
if off0 != 0 {
l.errorf(pos, "unicode (UTF-8) BOM in middle of file")
goto again
case BOMError:
switch {
case off0 == 0:
l.errorf(pos, "unicode (UTF-8) BOM at beginnig of file")
l.errorf(pos, "unicode (UTF-8) BOM in middle of file")
goto again
l.lookahead = NewChar(pos, r)
if r == '\n' {
return l.class()
// Next advances the scanner for one rune and returns the respective character
// class of the new lookahead. Typical usage in an .l file:
// %yyn c = l.Next()
func (l *Lexer) Next() int {
l.Prev = l.Last
r :=
l.Last = l.lookahead
return r
// Offset returns the current reading offset of the lexer's source.
func (l *Lexer) Offset() int { return l.off }
// Rule0 initializes the scanner state before the attempt to recognize a token
// starts. The token collecting buffer is cleared. Rule0 records the current
// lookahead in l.First and returns its class. Typical usage in an .l file:
// ... lex definitions
// %%
// c := l.Rule0()
// first-pattern-regexp
func (l *Lexer) Rule0() int {
if !l.lookahead.IsValid() {
l.First = l.lookahead
l.mark = -1
if len(l.tokenBuf) > 1<<18 { //DONE constant tuned
l.tokenBuf = nil
} else {
l.tokenBuf = l.tokenBuf[:0]
return l.class()
// Token returns the chars collected for the current token. The returned slice
// is the lexer's own buffer and must be treated as read only.
func (l *Lexer) Token() []Char {
	collected := l.tokenBuf
	return collected
}
// TokenBytes returns the UTF-8 encoding of Token. If builder is not nil then
// it's called instead to build the encoded token byte value into the buffer
// passed to it.
// The Result is R/O.
func (l *Lexer) TokenBytes(builder func(*bytes.Buffer)) []byte {
if len(l.bytesBuf.Bytes()) < 1<<18 { //DONE constant tuned
} else {
l.bytesBuf = bytes.Buffer{}
switch {
case builder != nil:
for _, c := range l.Token() {
return l.bytesBuf.Bytes()
// Unget unreads all chars in c.
func (l *Lexer) Unget(c ...Char) {
l.ungetBuf = append(l.ungetBuf, c...)
l.lookahead = Char{} // Must invalidate lookahead.
// Option is a function which can be passed as an optional argument to New.
// New calls the options in the order given, passing the Lexer being
// constructed, and fails with the first non-nil error an option returns.
type Option func(*Lexer) error
// BOMMode option selects how the lexer handles BOMs. See the BOM* constants for details.
func BOMMode(mode int) Option {
return func(l *Lexer) error {
l.bomMode = mode
return nil
// ErrorFunc option sets a function called when an, for example I/O error,
// occurs. The default is to call Error with the position and message already
// formated as a string.
func ErrorFunc(f func(token.Pos, string)) Option {
return func(l *Lexer) error {
l.errorf = f
return nil
// RuneClass option sets the function used to convert runes to character
// classes.
func RuneClass(f func(rune) int) Option {
return func(l *Lexer) error {
l.classf = f
return nil