| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321 |
- package tokenize
-
- import "fmt"
- import "strings"
-
// reader is a forward-only cursor over an immutable input string.
// pos is a byte offset into data; every method asserts (via checkPos)
// that it stays within [0, len(data)].
type reader struct {
	data string
	pos  uint
}
-
- func (self *reader) len() uint {
- return uint(len(self.data))
- }
-
- func (self *reader) checkPos() {
- if self.pos > uint(len(self.data)) {
- panic(fmt.Sprintf("invalid cursor state: pos=%d but len(data)=%d", self.pos, len(self.data)))
- }
- }
-
- func (self *reader) peekChar() (byte, bool) {
- self.checkPos()
- if self.done() {
- return 0, false
- }
- return self.data[self.pos], true
- }
-
- func (self *reader) takeChar() (string, bool) {
- self.checkPos()
- if self.pos == self.len() {
- return "", false
- }
- self.pos += 1
- return self.data[self.pos-1 : self.pos], true
- }
-
- func (self *reader) takeUntil(needle string) (string, bool) {
- self.checkPos()
- idx := strings.Index(self.data[self.pos:], needle)
- if idx == -1 {
- return "", false
- }
- end := self.pos + uint(idx)
- out := self.data[self.pos:end]
- self.pos = end
- return out, true
- }
-
- func (self *reader) takeUntilAnyOf(needles string) (string, bool) {
- self.checkPos()
- to_take := uint(0)
- found := false
- for {
- idx := self.pos + to_take
- if idx == self.len() {
- break
- }
- if hasByte(self.data[idx], needles) {
- break
- }
- to_take += 1
- found = true
- }
- end := self.pos + to_take
- out := self.data[self.pos:end]
- self.pos = end
- return out, found
- }
-
- func (self *reader) tossUntilNeitherOf(needles string) bool {
- self.checkPos()
- tossed := uint(0)
- for {
- idx := self.pos + tossed
- if idx == self.len() {
- break
- }
- if hasByte(self.data[idx], needles) {
- tossed += 1
- continue
- }
- break
- }
- self.pos += tossed
- return tossed > 0
- }
-
// hasByte reports whether b occurs anywhere in ns. Membership is
// byte-wise (not rune-wise), matching the byte-oriented reader.
func hasByte(b byte, ns string) bool {
	// strings.IndexByte is the stdlib primitive for this scan; it
	// replaces the hand-rolled index loop.
	return strings.IndexByte(ns, b) >= 0
}
-
- func (self *reader) tossChar() {
- self.checkPos()
- self.pos += 1
- }
-
- func (self *reader) tossChars(n uint) bool {
- self.checkPos()
- end := self.pos + n
- if end > self.len() {
- return false
- }
- self.pos = end
- return true
- }
-
- func (self *reader) done() bool {
- self.checkPos()
- return self.pos == self.len()
- }
-
// builder accumulates finished tokens plus at most one in-progress
// token. buf_set distinguishes "no token started" from an in-progress
// token that happens to be empty (the result of '' or ""), which is
// what lets bufCommit emit empty-string tokens.
type builder struct {
	tokens    []string
	buf_set   bool
	buf_chars string
}
-
- func make_builder() builder {
- return builder{tokens: make([]string, 0), buf_set: false, buf_chars: ""}
- }
-
- func (self *builder) bufAppend(chars string) {
- if self.buf_set {
- self.buf_chars += chars
- } else {
- self.buf_chars = chars
- self.buf_set = true
- }
- }
-
- func (self *builder) bufAppendChar(char byte) {
- if !self.buf_set {
- self.buf_chars = ""
- self.buf_set = true
- }
- self.buf_chars += string(char)
- }
-
- func (self *builder) bufCommit() {
- if !self.buf_set {
- return
- }
- self.tokens = append(self.tokens, self.buf_chars)
- self.buf_chars = ""
- self.buf_set = false
- }
-
// tokenizeResult is the internal outcome of tokenize: the tokens on
// success (code == tokenizeResultCodeOk), or a non-Ok code plus the
// byte offset (err_loc) in the input where the problem was detected.
type tokenizeResult struct {
	tokens  []string
	code    tokenizeResultCode
	err_loc uint
}
-
// tokenizeResultCode classifies the outcome of tokenize / tokenize_dq.
type tokenizeResultCode uint8

const (
	tokenizeResultCodeOk tokenizeResultCode = iota
	tokenizeResultCodeMissingEndSingleQuote
	tokenizeResultCodeMissingEndDoubleQuote
	tokenizeResultCodeMissingEscapedCharacter
)

// String implements fmt.Stringer for debugging output.
func (c tokenizeResultCode) String() string {
	switch c {
	case tokenizeResultCodeOk:
		return "tokenizeResultCodeOk"
	case tokenizeResultCodeMissingEndSingleQuote:
		return "tokenizeResultCodeMissingEndSingleQuote"
	case tokenizeResultCodeMissingEndDoubleQuote:
		return "tokenizeResultCodeMissingEndDoubleQuote"
	case tokenizeResultCodeMissingEscapedCharacter:
		return "tokenizeResultCodeMissingEscapedCharacter"
	default:
		return fmt.Sprintf("unknown!tokenizeResultCode(%d)", c)
	}
}
-
- func tokenize_dq(str string) (string, tokenizeResultCode, uint) {
- buf := ""
- rdr := reader{data: str}
- for {
- dq_char, ok := rdr.peekChar()
- if !ok {
- return "", tokenizeResultCodeMissingEndDoubleQuote, rdr.pos
- }
- switch dq_char {
- case '\\':
- rdr.tossChar()
- next_char, ok := rdr.peekChar()
- if !ok {
- return "", tokenizeResultCodeMissingEscapedCharacter, rdr.pos - 1
- }
- switch next_char {
- case '"':
- rdr.tossChar()
- buf = buf + string(next_char)
- case '\\':
- rdr.tossChar()
- buf = buf + string(next_char)
- default:
- buf = buf + "\\"
- }
- case '"':
- rdr.tossChar()
- return buf, tokenizeResultCodeOk, rdr.pos
- default:
- dq_chars, found := rdr.takeUntilAnyOf("\\\"")
- if !found {
- return "", tokenizeResultCodeMissingEndDoubleQuote, rdr.pos
- }
- buf = buf + dq_chars
- }
- }
-
- }
-
- func tokenize(str string) tokenizeResult {
- rdr := reader{data: str}
- b := make_builder()
- for {
- this_char, ok := rdr.peekChar()
- if !ok {
- break
- }
- switch this_char {
- case ' ':
- b.bufCommit()
- rdr.tossUntilNeitherOf(" \t\n")
- case '\t':
- b.bufCommit()
- rdr.tossUntilNeitherOf(" \t\n")
- case '\n':
- b.bufCommit()
- rdr.tossUntilNeitherOf(" \t\n")
- case '\'':
- rdr.tossChar() // first `'`
- new_chars, found := rdr.takeUntil("'")
- if !found {
- return tokenizeResult{code: tokenizeResultCodeMissingEndSingleQuote, err_loc: rdr.pos - 1}
- }
- b.bufAppend(new_chars)
- rdr.tossChar() // the second `'`
- case '"':
- rdr.tossChar() // first `"`
- dq_chars, code, dq_rdr_taken := tokenize_dq(rdr.data[rdr.pos:])
- switch code {
- case tokenizeResultCodeOk:
- ok := rdr.tossChars(dq_rdr_taken)
- if !ok {
- panic(fmt.Sprintf(
- "invalid tokenize_dq() result: claims to have taken %d chars but there were only %d",
- dq_rdr_taken, rdr.len()-rdr.pos,
- ))
- }
- b.bufAppend(dq_chars)
- default:
- return tokenizeResult{code: code, err_loc: dq_rdr_taken}
- }
- case '\\':
- rdr.tossChar()
- if rdr.done() {
- return tokenizeResult{code: tokenizeResultCodeMissingEscapedCharacter, err_loc: rdr.pos - 1}
- }
- new_chars, _ := rdr.takeChar()
- b.bufAppend(new_chars)
- default:
- b.bufAppendChar(this_char)
- rdr.tossChar()
- }
- }
- b.bufCommit()
- return tokenizeResult{tokens: b.tokens}
- }
-
- func Tokenize(str string) ([]string, error) {
- res := tokenize(str)
- switch res.code {
- case tokenizeResultCodeOk:
- return res.tokens, nil
- case tokenizeResultCodeMissingEndSingleQuote:
- return nil, TokenizeError{code: TokenizeErrorCodeMissingEndSingleQuote, loc: res.err_loc}
- case tokenizeResultCodeMissingEndDoubleQuote:
- return nil, TokenizeError{code: TokenizeErrorCodeMissingEndDoubleQuote, loc: res.err_loc}
- case tokenizeResultCodeMissingEscapedCharacter:
- return nil, TokenizeError{code: TokenizeErrorCodeMissingEscapedCharacter, loc: res.err_loc}
- default:
- return nil, TokenizeError{code: TokenizeErrorCodeGeneral, loc: res.err_loc}
- }
- }
-
// TokenizeError describes why Tokenize failed and where: loc is the
// byte offset into the input at which the problem was detected.
type TokenizeError struct {
	code TokenizeErrorCode
	loc  uint
}

// TokenizeErrorCode enumerates the failure modes of Tokenize.
type TokenizeErrorCode uint8

// Failure modes reported by Tokenize.
const (
	TokenizeErrorCodeGeneral TokenizeErrorCode = iota
	TokenizeErrorCodeMissingEndSingleQuote
	TokenizeErrorCodeMissingEndDoubleQuote
	TokenizeErrorCodeMissingEscapedCharacter
)
-
// Error implements the error interface, rendering the failure kind and
// the byte offset at which it was detected.
func (e TokenizeError) Error() string {
	switch e.code {
	case TokenizeErrorCodeMissingEndSingleQuote:
		return fmt.Sprintf("unterminated single-quote: at %d", e.loc)
	case TokenizeErrorCodeMissingEndDoubleQuote:
		return fmt.Sprintf("unterminated double-quote: at %d", e.loc)
	case TokenizeErrorCodeMissingEscapedCharacter:
		return fmt.Sprintf("missing escaped character: at %d", e.loc)
	default:
		// includes TokenizeErrorCodeGeneral and any future codes
		return fmt.Sprintf("unknown TokenizeError code: .code=%d .loc=%d", e.code, e.loc)
	}
}
|