You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
411 lines
10 KiB
411 lines
10 KiB
// Copyright (c) 2015 The golex Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package lex // import "modernc.org/golex/lex"
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"go/token"
|
|
"io"
|
|
"os"
|
|
)
|
|
|
|
// Byte order mark (BOM) handling modes. A mode is selected by passing the
// BOMMode option to New; a Lexer created without that option uses
// BOMIgnoreFirst.
const (
	// BOMError reports a BOM as an error wherever it is found.
	BOMError = iota

	// BOMIgnoreFirst silently skips a BOM at the very beginning of the input
	// and reports a BOM found anywhere else as an error.
	BOMIgnoreFirst

	// BOMPassAll applies no special handling to BOMs at all.
	BOMPassAll

	// BOMPassFirst applies no special handling to a BOM at the very beginning
	// of the input and reports a BOM found anywhere else as an error.
	BOMPassFirst
)
|
|
|
|
// Special values used when classifying runes.
const (
	// NonASCII is the class DefaultRuneClass returns for non ASCII runes.
	NonASCII = 0x80

	// RuneEOF is distinct from any valid Unicode rune value.
	RuneEOF = -1
)
|
|
|
|
// DefaultRuneClass returns the character class of r. If r is an ASCII code
|
|
// then its class equals the ASCII code. Any other rune is of class NonASCII.
|
|
//
|
|
// DefaultRuneClass is the default implementation Lexer will use to convert
|
|
// runes (21 bit entities) to scanner classes (8 bit entities).
|
|
//
|
|
// Non ASCII aware lexical analyzers will typically use their own
|
|
// categorization function. To assign such custom function use the RuneClass
|
|
// option.
|
|
func DefaultRuneClass(r rune) int {
|
|
if r >= 0 && r < 0x80 {
|
|
return int(r)
|
|
}
|
|
|
|
return NonASCII
|
|
}
|
|
|
|
// Char is a rune paired with the position it was read at. The position is
// kept as a 32 bit value.
type Char struct {
	Rune rune
	pos  int32
}

// NewChar returns the Char representing rune r found at position pos.
func NewChar(pos token.Pos, r rune) Char {
	return Char{Rune: r, pos: int32(pos)}
}

// IsValid reports whether c has a valid position. The zero Char has none and
// is thus not valid.
func (c Char) IsValid() bool {
	return token.Pos(c.pos) != token.NoPos
}

// Pos returns the position of c as a token.Pos.
func (c Char) Pos() token.Pos {
	p := token.Pos(c.pos)
	return p
}
|
|
|
|
// CharReader is a RuneReader providing additionally explicit position
|
|
// information by returning a Char instead of a rune as its first result.
|
|
type CharReader interface {
|
|
ReadChar() (c Char, size int, err error)
|
|
}
|
|
|
|
// Lexer suports golex[0] generated lexical analyzers.
|
|
type Lexer struct {
|
|
File *token.File // The *token.File passed to New.
|
|
First Char // First remembers the lookahead char when Rule0 was invoked.
|
|
Last Char // Last remembers the last Char returned by Next.
|
|
Prev Char // Prev remembers the Char previous to Last.
|
|
bomMode int // See the BOM* constants.
|
|
bytesBuf bytes.Buffer // Used by TokenBytes.
|
|
charSrc CharReader // Lexer alternative input.
|
|
classf func(rune) int //
|
|
errorf func(token.Pos, string) //
|
|
lookahead Char // Lookahead if non zero.
|
|
mark int // Longest match marker.
|
|
off int // Used for File.AddLine.
|
|
src io.RuneReader // Lexer input.
|
|
tokenBuf []Char // Lexeme collector.
|
|
ungetBuf []Char // Unget buffer.
|
|
}
|
|
|
|
// New returns a new *Lexer. The result can be amended using opts.
|
|
//
|
|
// Non Unicode Input
|
|
//
|
|
// To consume sources in other encodings and still have exact position
|
|
// information, pass an io.RuneReader which returns the next input character
|
|
// reencoded as an Unicode rune but returns the size (number of bytes used to
|
|
// encode it) of the original character, not the size of its UTF-8
|
|
// representation after converted to an Unicode rune. Size is the second
|
|
// returned value of io.RuneReader.ReadRune method[4].
|
|
//
|
|
// When src optionally implements CharReader its ReadChar method is used
|
|
// instead of io.ReadRune.
|
|
func New(file *token.File, src io.RuneReader, opts ...Option) (*Lexer, error) {
|
|
r := &Lexer{
|
|
File: file,
|
|
bomMode: BOMIgnoreFirst,
|
|
classf: DefaultRuneClass,
|
|
src: src,
|
|
}
|
|
if x, ok := src.(CharReader); ok {
|
|
r.charSrc = x
|
|
}
|
|
r.errorf = r.defaultErrorf
|
|
for _, o := range opts {
|
|
if err := o(r); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
return r, nil
|
|
}
|
|
|
|
// Abort handles the situation when the scanner does not successfully recognize
|
|
// any token or when an attempt to find the longest match "overruns" from an
|
|
// accepting state only to never reach an accepting state again. In the first
|
|
// case the scanner was never in an accepting state since last call to Rule0
|
|
// and then (true, previousLookahead rune) is returned, effectively consuming a
|
|
// single Char token, avoiding scanner stall. Otherwise there was at least one
|
|
// accepting scanner state marked using Mark. In this case Abort rollbacks the
|
|
// lexer state to the marked state and returns (false, 0). The scanner must
|
|
// then execute a prescribed goto statement. For example:
|
|
//
|
|
// %yyc c
|
|
// %yyn c = l.Next()
|
|
// %yym l.Mark()
|
|
//
|
|
// %{
|
|
// package foo
|
|
//
|
|
// import (...)
|
|
//
|
|
// type lexer struct {
|
|
// *lex.Lexer
|
|
// ...
|
|
// }
|
|
//
|
|
// func newLexer(...) *lexer {
|
|
// return &lexer{
|
|
// lex.NewLexer(...),
|
|
// ...
|
|
// }
|
|
// }
|
|
//
|
|
// func (l *lexer) scan() int {
|
|
// c := l.Enter()
|
|
// %}
|
|
//
|
|
// ... more lex defintions
|
|
//
|
|
// %%
|
|
//
|
|
// c = l.Rule0()
|
|
//
|
|
// ... lex rules
|
|
//
|
|
// %%
|
|
//
|
|
// if c, ok := l.Abort(); ok {
|
|
// return c
|
|
// }
|
|
//
|
|
// goto yyAction
|
|
// }
|
|
func (l *Lexer) Abort() (int, bool) {
|
|
if l.mark >= 0 {
|
|
if len(l.tokenBuf) > l.mark {
|
|
l.Unget(l.lookahead)
|
|
for i := len(l.tokenBuf) - 1; i >= l.mark; i-- {
|
|
l.Unget(l.tokenBuf[i])
|
|
}
|
|
}
|
|
l.tokenBuf = l.tokenBuf[:l.mark]
|
|
return 0, false
|
|
}
|
|
|
|
switch n := len(l.tokenBuf); n {
|
|
case 0: // [] z
|
|
c := l.lookahead
|
|
l.Next()
|
|
return int(c.Rune), true
|
|
case 1: // [a] z
|
|
return int(l.tokenBuf[0].Rune), true
|
|
default: // [a, b, ...], z
|
|
c := l.tokenBuf[0] // a
|
|
l.Unget(l.lookahead) // z
|
|
for i := n - 1; i > 1; i-- {
|
|
l.Unget(l.tokenBuf[i]) // ...
|
|
}
|
|
l.lookahead = l.tokenBuf[1] // b
|
|
l.tokenBuf = l.tokenBuf[:1]
|
|
return int(c.Rune), true
|
|
}
|
|
}
|
|
|
|
// class returns the character class of the current lookahead rune as
// computed by the configured classification function.
func (l *Lexer) class() int {
	r := l.lookahead.Rune
	return l.classf(r)
}
|
|
|
|
func (l *Lexer) defaultErrorf(pos token.Pos, msg string) {
|
|
l.Error(fmt.Sprintf("%v: %v", l.File.Position(pos), msg))
|
|
}
|
|
|
|
// Enter ensures the lexer has a valid lookahead Char and returns its class.
|
|
// Typical use in an .l file
|
|
//
|
|
// func (l *lexer) scan() lex.Char {
|
|
// c := l.Enter()
|
|
// ...
|
|
func (l *Lexer) Enter() int {
|
|
if !l.lookahead.IsValid() {
|
|
l.Next()
|
|
}
|
|
return l.class()
|
|
}
|
|
|
|
// Error Implements yyLexer[2] by printing the msg to stderr.
|
|
func (l *Lexer) Error(msg string) {
|
|
fmt.Fprintf(os.Stderr, "%s\n", msg)
|
|
}
|
|
|
|
// Lookahead returns the current lookahead.
|
|
func (l *Lexer) Lookahead() Char {
|
|
if !l.lookahead.IsValid() {
|
|
l.Next()
|
|
}
|
|
return l.lookahead
|
|
}
|
|
|
|
// Mark records the current state of scanner as accepting. It implements the
|
|
// golex macro %yym. Typical usage in an .l file:
|
|
//
|
|
// %yym l.Mark()
|
|
func (l *Lexer) Mark() { l.mark = len(l.tokenBuf) }
|
|
|
|
func (l *Lexer) next() int {
|
|
const bom = '\ufeff'
|
|
|
|
if c := l.lookahead; c.IsValid() {
|
|
l.tokenBuf = append(l.tokenBuf, c)
|
|
}
|
|
if n := len(l.ungetBuf); n != 0 {
|
|
l.lookahead = l.ungetBuf[n-1]
|
|
l.ungetBuf = l.ungetBuf[:n-1]
|
|
return l.class()
|
|
}
|
|
|
|
if l.src == nil {
|
|
return RuneEOF
|
|
}
|
|
|
|
var r rune
|
|
var sz int
|
|
var err error
|
|
var pos token.Pos
|
|
var c Char
|
|
again:
|
|
off0 := l.off
|
|
switch cs := l.charSrc; {
|
|
case cs != nil:
|
|
c, sz, err = cs.ReadChar()
|
|
r = c.Rune
|
|
pos = c.Pos()
|
|
default:
|
|
r, sz, err = l.src.ReadRune()
|
|
pos = l.File.Pos(l.off)
|
|
}
|
|
l.off += sz
|
|
if err != nil {
|
|
l.src = nil
|
|
r = RuneEOF
|
|
if err != io.EOF {
|
|
l.errorf(pos, err.Error())
|
|
}
|
|
}
|
|
|
|
if r == bom {
|
|
switch l.bomMode {
|
|
default:
|
|
fallthrough
|
|
case BOMIgnoreFirst:
|
|
if off0 != 0 {
|
|
l.errorf(pos, "unicode (UTF-8) BOM in middle of file")
|
|
}
|
|
goto again
|
|
case BOMPassAll:
|
|
// nop
|
|
case BOMPassFirst:
|
|
if off0 != 0 {
|
|
l.errorf(pos, "unicode (UTF-8) BOM in middle of file")
|
|
goto again
|
|
}
|
|
case BOMError:
|
|
switch {
|
|
case off0 == 0:
|
|
l.errorf(pos, "unicode (UTF-8) BOM at beginnig of file")
|
|
default:
|
|
l.errorf(pos, "unicode (UTF-8) BOM in middle of file")
|
|
}
|
|
goto again
|
|
}
|
|
}
|
|
|
|
l.lookahead = NewChar(pos, r)
|
|
if r == '\n' {
|
|
l.File.AddLine(l.off)
|
|
}
|
|
return l.class()
|
|
}
|
|
|
|
// Next advances the scanner for one rune and returns the respective character
|
|
// class of the new lookahead. Typical usage in an .l file:
|
|
//
|
|
// %yyn c = l.Next()
|
|
func (l *Lexer) Next() int {
|
|
l.Prev = l.Last
|
|
r := l.next()
|
|
l.Last = l.lookahead
|
|
return r
|
|
}
|
|
|
|
// Offset returns the current reading offset of the lexer's source.
|
|
func (l *Lexer) Offset() int { return l.off }
|
|
|
|
// Rule0 initializes the scanner state before the attempt to recognize a token
|
|
// starts. The token collecting buffer is cleared. Rule0 records the current
|
|
// lookahead in l.First and returns its class. Typical usage in an .l file:
|
|
//
|
|
// ... lex definitions
|
|
//
|
|
// %%
|
|
//
|
|
// c := l.Rule0()
|
|
//
|
|
// first-pattern-regexp
|
|
func (l *Lexer) Rule0() int {
|
|
if !l.lookahead.IsValid() {
|
|
l.Next()
|
|
}
|
|
l.First = l.lookahead
|
|
l.mark = -1
|
|
if len(l.tokenBuf) > 1<<18 { //DONE constant tuned
|
|
l.tokenBuf = nil
|
|
} else {
|
|
l.tokenBuf = l.tokenBuf[:0]
|
|
}
|
|
return l.class()
|
|
}
|
|
|
|
// Token returns the currently collected token chars. The result is R/O.
|
|
func (l *Lexer) Token() []Char { return l.tokenBuf }
|
|
|
|
// TokenBytes returns the UTF-8 encoding of Token. If builder is not nil then
|
|
// it's called instead to build the encoded token byte value into the buffer
|
|
// passed to it.
|
|
//
|
|
// The Result is R/O.
|
|
func (l *Lexer) TokenBytes(builder func(*bytes.Buffer)) []byte {
|
|
if len(l.bytesBuf.Bytes()) < 1<<18 { //DONE constant tuned
|
|
l.bytesBuf.Reset()
|
|
} else {
|
|
l.bytesBuf = bytes.Buffer{}
|
|
}
|
|
switch {
|
|
case builder != nil:
|
|
builder(&l.bytesBuf)
|
|
default:
|
|
for _, c := range l.Token() {
|
|
l.bytesBuf.WriteRune(c.Rune)
|
|
}
|
|
}
|
|
return l.bytesBuf.Bytes()
|
|
}
|
|
|
|
// Unget unreads all chars in c.
|
|
func (l *Lexer) Unget(c ...Char) {
|
|
l.ungetBuf = append(l.ungetBuf, c...)
|
|
l.lookahead = Char{} // Must invalidate lookahead.
|
|
}
|
|
|
|
// Option is a function which can be passed as an optional argument to New.
|
|
type Option func(*Lexer) error
|
|
|
|
// BOMMode option selects how the lexer handles BOMs. See the BOM* constants for details.
|
|
func BOMMode(mode int) Option {
|
|
return func(l *Lexer) error {
|
|
l.bomMode = mode
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// ErrorFunc option sets a function called when an, for example I/O error,
|
|
// occurs. The default is to call Error with the position and message already
|
|
// formated as a string.
|
|
func ErrorFunc(f func(token.Pos, string)) Option {
|
|
return func(l *Lexer) error {
|
|
l.errorf = f
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// RuneClass option sets the function used to convert runes to character
|
|
// classes.
|
|
func RuneClass(f func(rune) int) Option {
|
|
return func(l *Lexer) error {
|
|
l.classf = f
|
|
return nil
|
|
}
|
|
}
|