package tokenize

import (
	"fmt"
	"strings"
)

// reader is a byte-oriented cursor over the input string. pos is the index
// of the next unread byte; pos == len(data) means the input is exhausted.
type reader struct {
	data string
	pos  uint
}

func (self *reader) len() uint { return uint(len(self.data)) }

// checkPos panics if the cursor has somehow moved past the end of the input;
// every method calls it first as an internal-consistency guard.
func (self *reader) checkPos() {
	if self.pos > uint(len(self.data)) {
		panic(fmt.Sprintf("invalid cursor state: pos=%d but len(data)=%d", self.pos, len(self.data)))
	}
}

// peekChar returns the next byte without consuming it.
func (self *reader) peekChar() (byte, bool) {
	self.checkPos()
	if self.done() {
		return 0, false
	}
	return self.data[self.pos], true
}

// takeChar consumes one byte and returns it as a one-byte string.
func (self *reader) takeChar() (string, bool) {
	self.checkPos()
	if self.pos == self.len() {
		return "", false
	}
	self.pos += 1
	return self.data[self.pos-1 : self.pos], true
}

// takeUntil consumes and returns everything up to (but not including) the
// next occurrence of needle, leaving the cursor on the needle itself.
// It consumes nothing and reports false if needle is absent.
func (self *reader) takeUntil(needle string) (string, bool) {
	self.checkPos()
	idx := strings.Index(self.data[self.pos:], needle)
	if idx == -1 {
		return "", false
	}
	end := self.pos + uint(idx)
	out := self.data[self.pos:end]
	self.pos = end
	return out, true
}

// tossUntilNeitherOf discards a run of bytes drawn from needles and reports
// whether anything was discarded.
func (self *reader) tossUntilNeitherOf(needles string) bool {
	self.checkPos()
	tossed := uint(0)
	for {
		idx := self.pos + tossed
		if idx == self.len() {
			break
		}
		if hasByte(self.data[idx], needles) {
			tossed += 1
			continue
		}
		break
	}
	self.pos += tossed
	return tossed > 0
}

// hasByte reports whether b occurs anywhere in ns.
func hasByte(b byte, ns string) bool {
	for i := range len(ns) {
		if b == ns[i] {
			return true
		}
	}
	return false
}

// tossChar consumes one byte without returning it.
func (self *reader) tossChar() {
	self.checkPos()
	self.pos += 1
}

// done reports whether the entire input has been consumed.
func (self *reader) done() bool {
	self.checkPos()
	return self.pos == self.len()
}

// builder accumulates bytes into a pending token buffer and commits finished
// tokens. buf_set distinguishes "no pending token" from "pending empty
// token", so that a quoted empty string like '' still yields a token.
type builder struct {
	tokens    []string
	buf_set   bool
	buf_chars string
}

func make_builder() builder {
	return builder{tokens: make([]string, 0), buf_set: false, buf_chars: ""}
}

// bufAppend appends chars to the pending token, starting one if needed.
func (self *builder) bufAppend(chars string) {
	if self.buf_set {
		self.buf_chars += chars
	} else {
		self.buf_chars = chars
		self.buf_set = true
	}
}

// bufAppendChar appends a single byte to the pending token.
func (self *builder) bufAppendChar(char byte) {
	if !self.buf_set {
		self.buf_chars = ""
		self.buf_set = true
	}
	self.buf_chars += string(char)
}

// bufCommit finalizes the pending token, if any, and resets the buffer.
func (self *builder) bufCommit() {
	if !self.buf_set {
		return
	}
	self.tokens = append(self.tokens, self.buf_chars)
	self.buf_chars = ""
	self.buf_set = false
}

// tokenizeResult is the internal result: tokens on success, otherwise a
// failure code plus the byte offset (err_loc) where the problem begins.
type tokenizeResult struct {
	tokens  []string
	code    tokenizeResultCode
	err_loc uint
}

type tokenizeResultCode uint8

func (self tokenizeResultCode) String() string {
	switch self {
	case tokenizeResultCodeOk:
		return "tokenizeResultCodeOk"
	case tokenizeResultCodeMissingEndSingleQuote:
		return "tokenizeResultCodeMissingEndSingleQuote"
	case tokenizeResultCodeMissingEndDoubleQuote:
		return "tokenizeResultCodeMissingEndDoubleQuote"
	case tokenizeResultCodeMissingEscapedCharacter:
		return "tokenizeResultCodeMissingEscapedCharacter"
	default:
		return fmt.Sprintf("unknown!tokenizeResultCode(%d)", self)
	}
}

const (
	tokenizeResultCodeOk tokenizeResultCode = iota
	tokenizeResultCodeMissingEndSingleQuote
	tokenizeResultCodeMissingEndDoubleQuote
	tokenizeResultCodeMissingEscapedCharacter
)

// tokenize splits str on unquoted whitespace, honoring single quotes, double
// quotes, and backslash escapes.
func tokenize(str string) tokenizeResult {
	rdr := reader{data: str}
	b := make_builder()
	for {
		this_char, ok := rdr.peekChar()
		if !ok {
			break
		}
		switch this_char {
		case ' ', '\t', '\n':
			// Unquoted whitespace ends the current token; discard the
			// whole run in one step.
			b.bufCommit()
			rdr.tossUntilNeitherOf(" \t\n")
		case '\'':
			rdr.tossChar() // the opening `'`
			new_chars, found := rdr.takeUntil("'")
			if !found {
				return tokenizeResult{code: tokenizeResultCodeMissingEndSingleQuote, err_loc: rdr.pos - 1}
			}
			b.bufAppend(new_chars)
			rdr.tossChar() // the closing `'`
		case '"':
			rdr.tossChar() // the opening `"`
			new_chars, found := rdr.takeUntil("\"")
			if !found {
				return tokenizeResult{code: tokenizeResultCodeMissingEndDoubleQuote, err_loc: rdr.pos - 1}
			}
			b.bufAppend(new_chars)
			rdr.tossChar() // the closing `"`
		case '\\':
			// A backslash forces the next byte into the token verbatim.
			rdr.tossChar()
			if rdr.done() {
				return tokenizeResult{code: tokenizeResultCodeMissingEscapedCharacter, err_loc: rdr.pos - 1}
			}
			new_chars, _ := rdr.takeChar()
			b.bufAppend(new_chars)
		default:
			b.bufAppendChar(this_char)
			rdr.tossChar()
		}
	}
	b.bufCommit() // flush a token left pending at end of input
	return tokenizeResult{tokens: b.tokens}
}

// Tokenize is the exported entry point: it maps the internal result codes
// onto TokenizeError values.
func Tokenize(str string) ([]string, error) {
	res := tokenize(str)
	switch res.code {
	case tokenizeResultCodeOk:
		return res.tokens, nil
	case tokenizeResultCodeMissingEndSingleQuote:
		return nil, TokenizeError{code: TokenizeErrorCodeMissingEndSingleQuote, loc: res.err_loc}
	case tokenizeResultCodeMissingEndDoubleQuote:
		return nil, TokenizeError{code: TokenizeErrorCodeMissingEndDoubleQuote, loc: res.err_loc}
	case tokenizeResultCodeMissingEscapedCharacter:
		return nil, TokenizeError{code: TokenizeErrorCodeMissingEscapedCharacter, loc: res.err_loc}
	default:
		return nil, TokenizeError{code: TokenizeErrorCodeGeneral, loc: res.err_loc}
	}
}

// TokenizeError reports what went wrong and the byte offset of the
// offending construct (the unmatched quote or trailing backslash).
type TokenizeError struct {
	code TokenizeErrorCode
	loc  uint
}

type TokenizeErrorCode uint8

const (
	TokenizeErrorCodeGeneral TokenizeErrorCode = iota
	TokenizeErrorCodeMissingEndSingleQuote
	TokenizeErrorCodeMissingEndDoubleQuote
	TokenizeErrorCodeMissingEscapedCharacter
)

func (e TokenizeError) Error() string {
	switch e.code {
	case TokenizeErrorCodeMissingEndSingleQuote:
		return fmt.Sprintf("unterminated single-quote: at %d", e.loc)
	case TokenizeErrorCodeMissingEndDoubleQuote:
		return fmt.Sprintf("unterminated double-quote: at %d", e.loc)
	case TokenizeErrorCodeMissingEscapedCharacter:
		return fmt.Sprintf("missing escaped character: at %d", e.loc)
	default:
		return fmt.Sprintf("unknown TokenizeError code: .code=%d .loc=%d", e.code, e.loc)
	}
}
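
// ExampleTokenize is a usage sketch for the exported API, not part of the
// original package: in a real layout it would live in tokenize_test.go so
// that `go test` verifies the Output comment, and the input string is
// illustrative only.
func ExampleTokenize() {
	tokens, err := Tokenize(`say "hello world" it\'s fine`)
	if err != nil {
		// On failure, err is a TokenizeError whose message includes the
		// byte offset of the unmatched quote or trailing backslash.
		fmt.Println("tokenize failed:", err)
		return
	}
	fmt.Printf("%q\n", tokens)
	// Output: ["say" "hello world" "it's" "fine"]
}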