tokenize.go

package tokenize

import (
	"fmt"
	"strings"
)

// reader is a byte cursor over an immutable input string.
type reader struct {
	data string
	pos  uint
}

func (self *reader) len() uint {
	return uint(len(self.data))
}

// checkPos panics if the cursor has somehow moved past the end of the input.
func (self *reader) checkPos() {
	if self.pos > uint(len(self.data)) {
		panic(fmt.Sprintf("invalid cursor state: pos=%d but len(data)=%d", self.pos, len(self.data)))
	}
}

// peekByte returns the byte under the cursor without advancing past it.
func (self *reader) peekByte() (byte, bool) {
	self.checkPos()
	if self.done() {
		return 0, false
	}
	return self.data[self.pos], true
}

// takeUntil returns everything between the cursor and the next occurrence of
// needle, leaving the cursor on the needle itself. It reports false if needle
// does not occur in the remaining input.
func (self *reader) takeUntil(needle string) (string, bool) {
	self.checkPos()
	idx := strings.Index(self.data[self.pos:], needle)
	if idx == -1 {
		return "", false
	}
	end := self.pos + uint(idx)
	out := self.data[self.pos:end]
	self.pos = end
	return out, true
}

// tossUntilNeitherOf advances the cursor past any run of bytes that appear in
// needles and reports whether anything was skipped.
func (self *reader) tossUntilNeitherOf(needles string) bool {
	self.checkPos()
	tossed := uint(0)
	for {
		idx := self.pos + tossed
		if idx == self.len() {
			break
		}
		if strings.IndexByte(needles, self.data[idx]) != -1 {
			tossed += 1
			continue
		}
		break
	}
	self.pos += tossed
	return tossed > 0
}

// tossChar advances the cursor one byte.
func (self *reader) tossChar() {
	self.checkPos()
	self.pos += 1
}

// done reports whether all input has been consumed.
func (self *reader) done() bool {
	self.checkPos()
	return self.pos == self.len()
}

// builder accumulates bytes into a pending token and collects finished
// tokens. buf_set distinguishes "no pending token" from "pending empty
// token", so quoted empty strings still commit as tokens.
type builder struct {
	tokens    []string
	buf_set   bool
	buf_chars string
}

func make_builder() builder {
	return builder{tokens: make([]string, 0), buf_set: false, buf_chars: ""}
}

// bufAppend adds chars to the pending token, starting one if necessary.
func (self *builder) bufAppend(chars string) {
	if self.buf_set {
		self.buf_chars += chars
	} else {
		self.buf_chars = chars
		self.buf_set = true
	}
}

// bufAppendChar adds a single byte to the pending token.
func (self *builder) bufAppendChar(char byte) {
	if !self.buf_set {
		self.buf_chars = ""
		self.buf_set = true
	}
	self.buf_chars += string(char)
}

// bufCommit moves the pending token, if any, into tokens and resets the
// buffer.
func (self *builder) bufCommit() {
	if !self.buf_set {
		return
	}
	self.tokens = append(self.tokens, self.buf_chars)
	self.buf_chars = ""
	self.buf_set = false
}
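
// exampleEmptyToken is a sketch added for illustration (it is not part of the
// original file). buf_set is what lets a quoted empty string commit as an
// empty token instead of being silently dropped.
func exampleEmptyToken() {
	tokens, _ := Tokenize("a '' b")
	fmt.Printf("%q\n", tokens) // ["a" "" "b"]
}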

// tokenizeResult carries the tokens on success, or an error code plus the
// byte offset of the failure.
type tokenizeResult struct {
	tokens  []string
	code    tokenizeResultCode
	err_loc uint
}

type tokenizeResultCode uint8

func (self tokenizeResultCode) String() string {
	switch self {
	case tokenizeResultCodeOk:
		return "tokenizeResultCodeOk"
	case tokenizeResultCodeMissingEndSingleQuote:
		return "tokenizeResultCodeMissingEndSingleQuote"
	case tokenizeResultCodeMissingEndDoubleQuote:
		return "tokenizeResultCodeMissingEndDoubleQuote"
	default:
		return fmt.Sprintf("unknown!tokenizeResultCode(%d)", self)
	}
}

const (
	tokenizeResultCodeOk tokenizeResultCode = iota
	tokenizeResultCodeMissingEndSingleQuote
	tokenizeResultCodeMissingEndDoubleQuote
)

// tokenize is the internal implementation: it walks the input byte by byte,
// splitting on unquoted whitespace and splicing quoted spans into the
// current token.
func tokenize(str string) tokenizeResult {
	rdr := reader{data: str}
	b := make_builder()
	for {
		this_byte, ok := rdr.peekByte()
		if !ok {
			break
		}
		switch this_byte {
		case ' ', '\t', '\n':
			b.bufCommit()
			rdr.tossUntilNeitherOf(" \t\n")
		case '\'':
			rdr.tossChar() // the opening `'`
			new_chars, found := rdr.takeUntil("'")
			if !found {
				// rdr.pos-1 points at the opening quote we just tossed.
				return tokenizeResult{code: tokenizeResultCodeMissingEndSingleQuote, err_loc: rdr.pos - 1}
			}
			b.bufAppend(new_chars)
			rdr.tossChar() // the closing `'`
		case '"':
			rdr.tossChar() // the opening `"`
			new_chars, found := rdr.takeUntil("\"")
			if !found {
				return tokenizeResult{code: tokenizeResultCodeMissingEndDoubleQuote, err_loc: rdr.pos - 1}
			}
			b.bufAppend(new_chars)
			rdr.tossChar() // the closing `"`
		default:
			b.bufAppendChar(this_byte)
			rdr.tossChar()
		}
	}
	b.bufCommit()
	return tokenizeResult{tokens: b.tokens}
}

// Tokenize splits str into whitespace-separated tokens. Single- and
// double-quoted spans are taken verbatim (whitespace inside them is kept)
// and glued into the surrounding token. An unterminated quote yields a
// TokenizeError pointing at the opening quote.
func Tokenize(str string) ([]string, error) {
	res := tokenize(str)
	switch res.code {
	case tokenizeResultCodeOk:
		return res.tokens, nil
	case tokenizeResultCodeMissingEndSingleQuote:
		return nil, TokenizeError{code: TokenizeErrorCodeMissingEndSingleQuote, loc: res.err_loc}
	case tokenizeResultCodeMissingEndDoubleQuote:
		return nil, TokenizeError{code: TokenizeErrorCodeMissingEndDoubleQuote, loc: res.err_loc}
	default:
		return nil, TokenizeError{code: TokenizeErrorCodeGeneral, loc: res.err_loc}
	}
}
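
// exampleTokenize is a usage sketch added for illustration (it is not part of
// the original file). Runs of unquoted whitespace act as a single separator,
// and quoted spans are glued into the surrounding token.
func exampleTokenize() {
	tokens, err := Tokenize(`echo 'hello world' "a b"  c`)
	fmt.Println(tokens, err) // [echo hello world a b c] <nil>
}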

// TokenizeError describes a tokenization failure and the byte offset at
// which it occurred.
type TokenizeError struct {
	code TokenizeErrorCode
	loc  uint
}

type TokenizeErrorCode uint8

const (
	TokenizeErrorCodeGeneral TokenizeErrorCode = iota
	TokenizeErrorCodeMissingEndSingleQuote
	TokenizeErrorCodeMissingEndDoubleQuote
)

func (e TokenizeError) Error() string {
	switch e.code {
	case TokenizeErrorCodeMissingEndSingleQuote:
		return fmt.Sprintf("unterminated single-quote: at %d", e.loc)
	case TokenizeErrorCodeMissingEndDoubleQuote:
		return fmt.Sprintf("unterminated double-quote: at %d", e.loc)
	default:
		return fmt.Sprintf("unknown TokenizeError code: .code=%d .loc=%d", e.code, e.loc)
	}
}
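
// exampleTokenizeError is an error-path sketch added for illustration (it is
// not part of the original file). An unterminated quote fails with the byte
// offset of the opening quote.
func exampleTokenizeError() {
	_, err := Tokenize("echo 'oops")
	fmt.Println(err) // unterminated single-quote: at 5
}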