dorm/vendor/modernc.org/golex/lex/api.go

// Copyright (c) 2015 The golex Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package lex // import "modernc.org/golex/lex"

import (
	"bytes"
	"fmt"
	"go/token"
	"io"
	"os"
)

// BOM handling modes which can be set by the BOMMode Option. Default is
// BOMIgnoreFirst.
//
// The BOM is the rune U+FEFF. It is "at beginning" when it is read at reading
// offset zero of the source. Whenever a mode reports a BOM as an error, that
// BOM is also dropped from the input.
const (
	BOMError       = iota // BOM is an error anywhere.
	BOMIgnoreFirst        // Skip BOM if at beginning, report as error if anywhere else.
	BOMPassAll            // No special handling of BOM.
	BOMPassFirst          // No special handling of BOM if at beginning, report as error if anywhere else.
)

// Special values used for runes and their classes.
const (
	NonASCII = 0x80 // Class DefaultRuneClass assigns to every non ASCII rune.
	RuneEOF  = -1   // Stands for the end of input; no valid Unicode rune has this value.
)

// DefaultRuneClass maps r to its character class. An ASCII rune is its own
// class. Every other value of r, negative ones included, falls into the
// single class NonASCII.
//
// Lexer uses DefaultRuneClass to turn runes (21 bit entities) into scanner
// classes (8 bit entities) unless the RuneClass option installs a different
// categorization function, which is what lexical analyzers aware of non ASCII
// input will typically do.
func DefaultRuneClass(r rune) int {
	switch {
	case r < 0, r >= NonASCII:
		return NonASCII
	default:
		return int(r)
	}
}

// Char couples a rune with the position it was read at.
type Char struct {
	Rune rune
	pos  int32 // The token.Pos of Rune, narrowed to 32 bits.
}

// NewChar returns a Char holding r at position pos.
func NewChar(pos token.Pos, r rune) Char {
	var c Char
	c.Rune = r
	c.pos = int32(pos)
	return c
}

// IsValid reports whether c has a valid position, which is what tells c apart
// from a zero Char.
func (c Char) IsValid() bool {
	return c.Pos() != token.NoPos
}

// Pos returns the token.Pos associated with c.
func (c Char) Pos() token.Pos {
	p := token.Pos(c.pos)
	return p
}

// CharReader is a RuneReader providing additionally explicit position
// information by returning a Char instead of a rune as its first result.
//
// When the source passed to New implements CharReader, the Lexer reads its
// input using ReadChar and takes the position of every character from the
// returned Char. The size and err results are handled the same way as the
// respective results of io.RuneReader's ReadRune.
type CharReader interface {
	ReadChar() (c Char, size int, err error)
}

// Lexer supports golex[0] generated lexical analyzers.
//
// A Lexer reads its source one rune at a time, keeps a single lookahead Char
// and collects the Chars consumed since the last call to Rule0 as the current
// token.
type Lexer struct {
	File      *token.File             // The *token.File passed to New.
	First     Char                    // First remembers the lookahead char when Rule0 was invoked.
	Last      Char                    // Last remembers the lookahead Char established by the most recent call to Next.
	Prev      Char                    // Prev remembers the Char previous to Last.
	bomMode   int                     // See the BOM* constants.
	bytesBuf  bytes.Buffer            // Used by TokenBytes.
	charSrc   CharReader              // Lexer alternative input; set when src implements CharReader.
	classf    func(rune) int          // Converts a rune to its class, see the RuneClass option.
	errorf    func(token.Pos, string) // Reports errors, see the ErrorFunc option.
	lookahead Char                    // Lookahead if non zero.
	mark      int                     // Longest match marker; negative when nothing was marked since Rule0.
	off       int                     // Reading offset; used for File.Pos and File.AddLine.
	src       io.RuneReader           // Lexer input; nil once the input ended.
	tokenBuf  []Char                  // Lexeme collector.
	ungetBuf  []Char                  // Unget buffer; read back from its end.
}

// New returns a new *Lexer reading from src and using file for positions. The
// opts, if any, are applied in order on top of the defaults: BOM mode
// BOMIgnoreFirst, rune classes computed by DefaultRuneClass and errors
// reported, prefixed by their position, using the Error method. The first
// option returning an error stops the construction; New then returns a nil
// *Lexer and that error.
//
// Non Unicode Input
//
// To consume sources in other encodings and still have exact position
// information, pass an io.RuneReader which returns the next input character
// reencoded as an Unicode rune but returns the size (number of bytes used to
// encode it) of the original character, not the size of its UTF-8
// representation after converted to an Unicode rune.  Size is the second
// returned value of io.RuneReader.ReadRune method[4].
//
// When src additionally implements CharReader, its ReadChar method is used
// instead of ReadRune.
func New(file *token.File, src io.RuneReader, opts ...Option) (*Lexer, error) {
	lx := &Lexer{File: file, src: src}
	lx.bomMode = BOMIgnoreFirst
	lx.classf = DefaultRuneClass
	lx.errorf = lx.defaultErrorf
	if cr, isCharReader := src.(CharReader); isCharReader {
		lx.charSrc = cr
	}

	for _, opt := range opts {
		err := opt(lx)
		if err != nil {
			return nil, err
		}
	}
	return lx, nil
}

// Abort handles the situation when the scanner does not successfully recognize
// any token or when an attempt to find the longest match "overruns" from an
// accepting state only to never reach an accepting state again. In the first
// case the scanner was never in an accepting state since the last call to
// Rule0 and then (r, true) is returned, where r is the rune of the Char the
// failed attempt started at. The token is cut down to that single Char, which
// is effectively consumed, avoiding scanner stall.  Otherwise there was at
// least one accepting scanner state marked using Mark. In this case Abort
// rolls the lexer state back to the marked state and returns (0, false). The
// scanner must then execute a prescribed goto statement. For example:
//
//	%yyc c
//	%yyn c = l.Next()
//	%yym l.Mark()
//
//	%{
//	package foo
//
//	import (...)
//
//	type lexer struct {
//		*lex.Lexer
//		...
//	}
//
//	func newLexer(...) (*lexer, error) {
//		lx, err := lex.New(...)
//		if err != nil {
//			return nil, err
//		}
//
//		return &lexer{lx, ...}, nil
//	}
//
//	func (l *lexer) scan() int {
//		c := l.Enter()
//	%}
//
//	... more lex definitions
//
//	%%
//
//		c = l.Rule0()
//
//	... lex rules
//
//	%%
//
//		if c, ok := l.Abort(); ok {
//			return c
//		}
//
//		goto yyAction
//	}
func (l *Lexer) Abort() (int, bool) {
	if l.mark >= 0 {
		// An accepting state was marked. Everything read past the mark,
		// the lookahead included, goes to the unget buffer in reverse
		// order, so it is read again in the original order.
		if len(l.tokenBuf) > l.mark {
			l.Unget(l.lookahead)
			for i := len(l.tokenBuf) - 1; i >= l.mark; i-- {
				l.Unget(l.tokenBuf[i])
			}
		}
		l.tokenBuf = l.tokenBuf[:l.mark]
		return 0, false
	}

	// Nothing was marked: consume exactly one Char. The case comments show
	// the state as [token buffer] lookahead.
	switch n := len(l.tokenBuf); n {
	case 0: // [] z
		c := l.lookahead
		l.Next()
		return int(c.Rune), true
	case 1: // [a] z
		return int(l.tokenBuf[0].Rune), true
	default: // [a, b, ...], z
		c := l.tokenBuf[0]   // a
		l.Unget(l.lookahead) // z
		for i := n - 1; i > 1; i-- {
			l.Unget(l.tokenBuf[i]) // ...
		}
		// Unget invalidated the lookahead, b takes its place.
		l.lookahead = l.tokenBuf[1] // b
		l.tokenBuf = l.tokenBuf[:1]
		return int(c.Rune), true
	}
}

// class returns the character class of the current lookahead rune as computed
// by the lexer's rune classification function.
func (l *Lexer) class() int {
	r := l.lookahead.Rune
	return l.classf(r)
}

// defaultErrorf is the error function New installs before applying options.
// It resolves pos using l.File, puts the resulting position in front of msg
// and passes the text to Error.
func (l *Lexer) defaultErrorf(pos token.Pos, msg string) {
	position := l.File.Position(pos)
	text := fmt.Sprintf("%v: %v", position, msg)
	l.Error(text)
}

// Enter makes sure a valid lookahead Char is available, advancing the lexer
// first when there is none, and returns the class of the lookahead. Typical
// use in an .l file
//
//	func (l *lexer) scan() lex.Char {
//		c := l.Enter()
//		...
func (l *Lexer) Enter() int {
	if l.lookahead.IsValid() {
		return l.class()
	}

	l.Next()
	return l.class()
}

// Error implements yyLexer[2]. It writes msg followed by a newline to
// os.Stderr.
func (l *Lexer) Error(msg string) {
	fmt.Fprintln(os.Stderr, msg)
}

// Lookahead returns the current lookahead Char. When the lexer has no valid
// lookahead, it is advanced first.
func (l *Lexer) Lookahead() Char {
	if c := l.lookahead; c.IsValid() {
		return c
	}

	l.Next()
	return l.lookahead
}

// Mark remembers the current scanner state as accepting by recording how many
// Chars the token has at this moment. It implements the golex macro %yym.
// Typical usage in an .l file:
//
//	%yym l.Mark()
func (l *Lexer) Mark() {
	n := len(l.tokenBuf)
	l.mark = n
}

// next advances the lexer by one Char and returns the class of the new
// lookahead. It does the work of Next except for maintaining Prev and Last.
//
// A valid lookahead being left behind is appended to the token. Chars
// returned by Unget are served before the source is consulted, the most
// recently ungot one first. Otherwise a rune is read from the source. Any
// read error, io.EOF included, ends the input: the source is dropped and the
// lookahead becomes RuneEOF. Errors other than io.EOF are additionally
// reported using the error function. Once the input has ended, next returns
// RuneEOF and leaves the lookahead alone.
//
// BOMs are handled as selected by the BOM mode; a skipped BOM never becomes
// the lookahead. After reading a newline the start of the following line is
// registered with l.File.
func (l *Lexer) next() int {
	const (
		bom          = '\ufeff'
		msgBOMFirst  = "unicode (UTF-8) BOM at beginning of file"
		msgBOMMiddle = "unicode (UTF-8) BOM in middle of file"
	)

	if c := l.lookahead; c.IsValid() {
		l.tokenBuf = append(l.tokenBuf, c)
	}
	if n := len(l.ungetBuf); n != 0 {
		l.lookahead = l.ungetBuf[n-1]
		l.ungetBuf = l.ungetBuf[:n-1]
		return l.class()
	}

	if l.src == nil { // The input has ended already.
		return RuneEOF
	}

	var r rune
	var sz int
	var err error
	var pos token.Pos
	var c Char
again:
	off0 := l.off // Offset of the rune about to be read.
	switch cs := l.charSrc; {
	case cs != nil:
		// The source provides positions itself.
		c, sz, err = cs.ReadChar()
		r = c.Rune
		pos = c.Pos()
	default:
		r, sz, err = l.src.ReadRune()
		pos = l.File.Pos(l.off)
	}
	l.off += sz
	if err != nil {
		l.src = nil
		r = RuneEOF
		if err != io.EOF {
			l.errorf(pos, err.Error())
		}
	}

	if r == bom {
		switch l.bomMode {
		default: // Unknown modes behave like BOMIgnoreFirst.
			fallthrough
		case BOMIgnoreFirst:
			if off0 != 0 {
				l.errorf(pos, msgBOMMiddle)
			}
			goto again
		case BOMPassAll:
			// nop
		case BOMPassFirst:
			if off0 != 0 {
				l.errorf(pos, msgBOMMiddle)
				goto again
			}
		case BOMError:
			switch {
			case off0 == 0:
				l.errorf(pos, msgBOMFirst)
			default:
				l.errorf(pos, msgBOMMiddle)
			}
			goto again
		}
	}

	l.lookahead = NewChar(pos, r)
	if r == '\n' {
		// l.off is now the offset of the first character of the next line.
		l.File.AddLine(l.off)
	}
	return l.class()
}

// Next advances the scanner by one rune and returns the character class of
// the new lookahead. On the way, Prev takes over the value of Last and Last
// is set to the new lookahead.  Typical usage in an .l file:
//
//	%yyn c = l.Next()
func (l *Lexer) Next() int {
	l.Prev = l.Last
	class := l.next()
	l.Last = l.lookahead
	return class
}

// Offset returns the current reading offset of the lexer's source, ie. the
// sum of the sizes of everything read from the source so far.
func (l *Lexer) Offset() int {
	return l.off
}

// Rule0 prepares the scanner state for an attempt to recognize a token. It
// makes sure there is a valid lookahead, records the lookahead in l.First,
// forgets any accepting state set by Mark and empties the token collecting
// buffer. The class of the lookahead is returned.  Typical usage in an .l
// file:
//
//	... lex definitions
//
//	%%
//
//		c := l.Rule0()
//
//	first-pattern-regexp
func (l *Lexer) Rule0() int {
	if !l.lookahead.IsValid() {
		l.Next()
	}
	l.First, l.mark = l.lookahead, -1

	// The buffer is reused unless the last token made it grow beyond the
	// limit; then it is dropped and can be garbage collected.
	const keepLimit = 1 << 18
	switch {
	case len(l.tokenBuf) > keepLimit:
		l.tokenBuf = nil
	default:
		l.tokenBuf = l.tokenBuf[:0]
	}
	return l.class()
}

// Token returns the chars collected for the current token so far. The result
// is the lexer's own buffer and must be treated as R/O.
func (l *Lexer) Token() []Char {
	return l.tokenBuf
}

// TokenBytes returns the UTF-8 encoding of Token. If builder is not nil then
// it's called instead, with an empty buffer, and whatever it writes to that
// buffer becomes the result.
//
// The result is R/O. It shares storage with a buffer the lexer reuses, so the
// next call of TokenBytes may overwrite it.
func (l *Lexer) TokenBytes(builder func(*bytes.Buffer)) []byte {
	// The buffer is reused unless the previous result was big; then a fresh
	// one takes its place.
	const keepLimit = 1 << 18
	buf := &l.bytesBuf
	if buf.Len() >= keepLimit {
		*buf = bytes.Buffer{}
	} else {
		buf.Reset()
	}

	if builder != nil {
		builder(buf)
		return buf.Bytes()
	}

	for _, c := range l.Token() {
		buf.WriteRune(c.Rune)
	}
	return buf.Bytes()
}

// Unget unreads all chars in c and invalidates the current lookahead. The
// chars are read back starting with the last one passed.
func (l *Lexer) Unget(c ...Char) {
	var none Char
	l.lookahead = none // Must invalidate lookahead.
	l.ungetBuf = append(l.ungetBuf, c...)
}

// Option is a function which can be passed as an optional argument to New.
// New calls every option with the Lexer under construction; a non nil error
// returned by an option makes New fail with that error.
type Option func(*Lexer) error

// BOMMode option selects how the lexer handles BOMs. See the BOM* constants
// for details. A mode which is none of those constants is handled like
// BOMIgnoreFirst.
func BOMMode(mode int) Option {
	set := func(lx *Lexer) error {
		lx.bomMode = mode
		return nil
	}
	return set
}

// ErrorFunc option sets the function the lexer calls to report an error, for
// example an I/O error of its source or a misplaced BOM. The function gets
// the position the error relates to and a message.  The default is to call
// Error with the position and message already formatted as a string.
//
// A nil f is ignored and the error function in effect is kept. Storing nil
// would make the lexer panic on the first error it has to report.
func ErrorFunc(f func(token.Pos, string)) Option {
	return func(l *Lexer) error {
		if f == nil {
			return nil
		}

		l.errorf = f
		return nil
	}
}

// RuneClass option sets the function used to convert runes to character
// classes. The default is DefaultRuneClass.
//
// A nil f is ignored and the function in effect is kept. Storing nil would
// make the lexer panic as soon as it needs the class of a rune.
func RuneClass(f func(rune) int) Option {
	return func(l *Lexer) error {
		if f == nil {
			return nil
		}

		l.classf = f
		return nil
	}
}