package tokenize

import (
    "fmt"
    "strings"
)

// reader is a byte-oriented cursor over an immutable input string.
type reader struct {
    data string
    pos  uint
}

func (r *reader) len() uint {
    return uint(len(r.data))
}

// checkPos panics if the cursor has somehow moved past the end of the
// input; it guards the invariant pos <= len(data).
func (r *reader) checkPos() {
    if r.pos > uint(len(r.data)) {
        panic(fmt.Sprintf("invalid cursor state: pos=%d but len(data)=%d", r.pos, len(r.data)))
    }
}

// peekByte returns the byte under the cursor without consuming it.
func (r *reader) peekByte() (byte, bool) {
    if r.done() { // done() also validates the cursor
        return 0, false
    }
    return r.data[r.pos], true
}

// takeUntil returns everything between the cursor and the next occurrence of
// needle, leaving the cursor on the needle itself so the caller can toss it.
// It reports false, with the cursor unmoved, when needle is absent.
func (r *reader) takeUntil(needle string) (string, bool) {
    r.checkPos()
    idx := strings.Index(r.data[r.pos:], needle)
    if idx == -1 {
        return "", false
    }
    end := r.pos + uint(idx)
    out := r.data[r.pos:end]
    r.pos = end
    return out, true
}

// tossUntilNeitherOf advances the cursor past any leading run of bytes that
// appear in needles, reporting whether anything was skipped.
func (r *reader) tossUntilNeitherOf(needles string) bool {
    r.checkPos()
    start := r.pos
    for r.pos < r.len() && strings.IndexByte(needles, r.data[r.pos]) != -1 {
        r.pos++
    }
    return r.pos > start
}

// tossChar consumes a single byte.
func (r *reader) tossChar() {
    r.checkPos()
    r.pos++
}

// done reports whether the whole input has been consumed.
func (r *reader) done() bool {
    r.checkPos()
    return r.pos == r.len()
}

// builder accumulates bytes into a pending token and collects finished
// tokens. bufSet distinguishes "no pending token" from "pending empty
// token", so a quoted empty string such as '' still produces a token.
type builder struct {
    tokens   []string
    bufSet   bool
    bufChars string
}

func newBuilder() builder {
    return builder{tokens: make([]string, 0)}
}

// bufAppend adds chars to the pending token, starting one if necessary.
// bufChars is always "" while bufSet is false, so appending is safe.
func (b *builder) bufAppend(chars string) {
    b.bufChars += chars
    b.bufSet = true
}

// bufAppendChar adds a single byte to the pending token.
func (b *builder) bufAppendChar(char byte) {
    b.bufChars += string(char)
    b.bufSet = true
}

// bufCommit finishes the pending token, if any, and resets the buffer.
func (b *builder) bufCommit() {
    if !b.bufSet {
        return
    }
    b.tokens = append(b.tokens, b.bufChars)
    b.bufChars = ""
    b.bufSet = false
}

// tokenizeResult is the internal outcome of a tokenize pass; errLoc is the
// byte offset of the offending character when code is not Ok.
type tokenizeResult struct {
    tokens []string
    code   tokenizeResultCode
    errLoc uint
}

type tokenizeResultCode uint8

const (
    tokenizeResultCodeOk tokenizeResultCode = iota
    tokenizeResultCodeMissingEndSingleQuote
    tokenizeResultCodeMissingEndDoubleQuote
)

func (c tokenizeResultCode) String() string {
    switch c {
    case tokenizeResultCodeOk:
        return "tokenizeResultCodeOk"
    case tokenizeResultCodeMissingEndSingleQuote:
        return "tokenizeResultCodeMissingEndSingleQuote"
    case tokenizeResultCodeMissingEndDoubleQuote:
        return "tokenizeResultCodeMissingEndDoubleQuote"
    default:
        return fmt.Sprintf("unknown!tokenizeResultCode(%d)", uint8(c))
    }
}

func tokenize(str string) tokenizeResult {
    rdr := reader{data: str}
    b := newBuilder()
    for {
        c, ok := rdr.peekByte()
        if !ok {
            break
        }
        switch c {
        case ' ', '\t', '\n':
            // Whitespace ends the pending token; collapse the whole run.
            b.bufCommit()
            rdr.tossUntilNeitherOf(" \t\n")
        case '\'':
            rdr.tossChar() // opening `'`
            chars, found := rdr.takeUntil("'")
            if !found {
                // pos already moved past the opening quote, so pos-1 points at it.
                return tokenizeResult{code: tokenizeResultCodeMissingEndSingleQuote, errLoc: rdr.pos - 1}
            }
            b.bufAppend(chars)
            rdr.tossChar() // closing `'`
        case '"':
            rdr.tossChar() // opening `"`
            chars, found := rdr.takeUntil(`"`)
            if !found {
                return tokenizeResult{code: tokenizeResultCodeMissingEndDoubleQuote, errLoc: rdr.pos - 1}
            }
            b.bufAppend(chars)
            rdr.tossChar() // closing `"`
        default:
            b.bufAppendChar(c)
            rdr.tossChar()
        }
    }
    b.bufCommit()
    return tokenizeResult{tokens: b.tokens}
}

// Tokenize splits str into whitespace-separated tokens. Single- and
// double-quoted segments keep their whitespace and lose their quotes; there
// is no escape handling, and adjacent quoted and unquoted segments fuse into
// a single token.
func Tokenize(str string) ([]string, error) {
    res := tokenize(str)
    switch res.code {
    case tokenizeResultCodeOk:
        return res.tokens, nil
    case tokenizeResultCodeMissingEndSingleQuote:
        return nil, TokenizeError{code: TokenizeErrorCodeMissingEndSingleQuote, loc: res.errLoc}
    case tokenizeResultCodeMissingEndDoubleQuote:
        return nil, TokenizeError{code: TokenizeErrorCodeMissingEndDoubleQuote, loc: res.errLoc}
    default:
        return nil, TokenizeError{code: TokenizeErrorCodeGeneral, loc: res.errLoc}
    }
}

// TokenizeError reports why tokenization failed and the byte offset of the
// offending character.
type TokenizeError struct {
    code TokenizeErrorCode
    loc  uint
}

type TokenizeErrorCode uint8

const (
    TokenizeErrorCodeGeneral TokenizeErrorCode = iota
    TokenizeErrorCodeMissingEndSingleQuote
    TokenizeErrorCodeMissingEndDoubleQuote
)

func (e TokenizeError) Error() string {
    switch e.code {
    case TokenizeErrorCodeMissingEndSingleQuote:
        return fmt.Sprintf("unterminated single-quote: at %d", e.loc)
    case TokenizeErrorCodeMissingEndDoubleQuote:
        return fmt.Sprintf("unterminated double-quote: at %d", e.loc)
    default:
        return fmt.Sprintf("unknown TokenizeError code: .code=%d .loc=%d", e.code, e.loc)
    }
}
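
// A minimal usage sketch. These Example functions are illustrative additions,
// not part of the original package; they would normally live in a
// tokenize_test.go file so `go test` verifies the Output comments. The
// expected values below follow from the tokenizer's rules above.
func ExampleTokenize() {
    tokens, err := Tokenize(`echo 'hello world' "a b"`)
    fmt.Println(tokens, err)
    // Output: [echo hello world a b] <nil>
}

func ExampleTokenize_unterminated() {
    _, err := Tokenize("echo 'oops")
    fmt.Println(err)
    // Output: unterminated single-quote: at 5
}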