// Package tokenize splits shell-like input strings into whitespace-separated
// tokens, honoring single- and double-quoted segments.
  1. package tokenize
import (
	"fmt"
	"strings"
)
  4. type reader struct {
  5. data string
  6. pos uint
  7. }
  8. func (self *reader) len() uint {
  9. return uint(len(self.data))
  10. }
  11. func (self *reader) checkPos() {
  12. if self.pos > uint(len(self.data)) {
  13. panic(fmt.Sprintf("invalid cursor state: pos=%d but len(data)=%d", self.pos, len(self.data)))
  14. }
  15. }
  16. func (self *reader) peekByte() (byte, bool) {
  17. self.checkPos()
  18. if self.done() {
  19. return 0, false
  20. }
  21. return self.data[self.pos], true
  22. }
  23. func (self *reader) takeUntil(needle string) (string, bool) {
  24. self.checkPos()
  25. idx := strings.Index(self.data[self.pos:], needle)
  26. if idx == -1 {
  27. return "", false
  28. }
  29. end := self.pos + uint(idx)
  30. out := self.data[self.pos:end]
  31. self.pos = end
  32. return out, true
  33. }
  34. func (self *reader) tossUntilNeitherOf(needles string) bool {
  35. self.checkPos()
  36. match := func(b byte, ns string) bool {
  37. for i := range len(ns) {
  38. if b == ns[i] {
  39. return true
  40. }
  41. }
  42. return false
  43. }
  44. tossed := uint(0)
  45. for {
  46. idx := self.pos + tossed
  47. if idx == self.len() {
  48. break
  49. }
  50. if match(self.data[idx], needles) {
  51. tossed += 1
  52. continue
  53. }
  54. break
  55. }
  56. self.pos += tossed
  57. return tossed > 0
  58. }
  59. func (self *reader) tossChar() {
  60. self.checkPos()
  61. self.pos += 1
  62. }
  63. func (self *reader) done() bool {
  64. self.checkPos()
  65. return self.pos == self.len()
  66. }
  67. type builder struct {
  68. tokens []string
  69. buf_set bool
  70. buf_chars string
  71. }
  72. func make_builder() builder {
  73. return builder{tokens: make([]string, 0), buf_set: false, buf_chars: ""}
  74. }
  75. func (self *builder) bufAppend(chars string) {
  76. if self.buf_set {
  77. self.buf_chars += chars
  78. } else {
  79. self.buf_chars = chars
  80. self.buf_set = true
  81. }
  82. }
  83. func (self *builder) bufAppendChar(char byte) {
  84. if !self.buf_set {
  85. self.buf_chars = ""
  86. self.buf_set = true
  87. }
  88. self.buf_chars += string(char)
  89. }
  90. func (self *builder) bufCommit() {
  91. if !self.buf_set {
  92. return
  93. }
  94. self.tokens = append(self.tokens, self.buf_chars)
  95. self.buf_chars = ""
  96. self.buf_set = false
  97. }
  98. type tokenizeResult struct {
  99. tokens []string
  100. code tokenizeResultCode
  101. err_loc uint
  102. }
  103. type tokenizeResultCode uint8
  104. func (self tokenizeResultCode) String() string {
  105. switch self {
  106. case tokenizeResultCodeOk:
  107. return "tokenizeResultCodeOk"
  108. case tokenizeResultCodeMissingEndSingleQuote:
  109. return "tokenizeResultCodeMissingEndSingleQuote"
  110. case tokenizeResultCodeMissingEndDoubleQuote:
  111. return "tokenizeResultCodeMissingEndDoubleQuote"
  112. default:
  113. return fmt.Sprintf("unknown!tokenizeResultCode(%d)", self)
  114. }
  115. }
  116. const (
  117. tokenizeResultCodeOk tokenizeResultCode = iota
  118. tokenizeResultCodeMissingEndSingleQuote
  119. tokenizeResultCodeMissingEndDoubleQuote
  120. )
  121. func tokenize(str string) tokenizeResult {
  122. rdr := reader{data: str}
  123. b := make_builder()
  124. for {
  125. this_byte, ok := rdr.peekByte()
  126. if !ok {
  127. break
  128. }
  129. switch this_byte {
  130. case ' ':
  131. b.bufCommit()
  132. rdr.tossUntilNeitherOf(" \t\n")
  133. case '\t':
  134. b.bufCommit()
  135. rdr.tossUntilNeitherOf(" \t\n")
  136. case '\n':
  137. b.bufCommit()
  138. rdr.tossUntilNeitherOf(" \t\n")
  139. case '\'':
  140. rdr.tossChar() // first `'`
  141. new_chars, found := rdr.takeUntil("'")
  142. if !found {
  143. return tokenizeResult{code: tokenizeResultCodeMissingEndSingleQuote, err_loc: rdr.pos - 1}
  144. }
  145. b.bufAppend(new_chars)
  146. rdr.tossChar() // the second `'`
  147. case '"':
  148. rdr.tossChar() // first `"`
  149. new_chars, found := rdr.takeUntil("\"")
  150. if !found {
  151. return tokenizeResult{code: tokenizeResultCodeMissingEndDoubleQuote, err_loc: rdr.pos - 1}
  152. }
  153. b.bufAppend(new_chars)
  154. rdr.tossChar() // the second `"`
  155. default:
  156. b.bufAppendChar(this_byte)
  157. rdr.tossChar()
  158. }
  159. }
  160. b.bufCommit()
  161. return tokenizeResult{tokens: b.tokens}
  162. }
  163. func Tokenize(str string) ([]string, error) {
  164. res := tokenize(str)
  165. switch res.code {
  166. case tokenizeResultCodeOk:
  167. return res.tokens, nil
  168. case tokenizeResultCodeMissingEndSingleQuote:
  169. return nil, TokenizeError{code: TokenizeErrorCodeMissingEndSingleQuote, loc: res.err_loc}
  170. case tokenizeResultCodeMissingEndDoubleQuote:
  171. return nil, TokenizeError{code: TokenizeErrorCodeMissingEndDoubleQuote, loc: res.err_loc}
  172. default:
  173. return nil, TokenizeError{code: TokenizeErrorCodeGeneral, loc: res.err_loc}
  174. }
  175. }
  176. type TokenizeError struct {
  177. code TokenizeErrorCode
  178. loc uint
  179. }
  180. type TokenizeErrorCode uint8
  181. const (
  182. TokenizeErrorCodeGeneral TokenizeErrorCode = iota
  183. TokenizeErrorCodeMissingEndSingleQuote
  184. TokenizeErrorCodeMissingEndDoubleQuote
  185. )
  186. func (e TokenizeError) Error() string {
  187. switch e.code {
  188. case TokenizeErrorCodeMissingEndSingleQuote:
  189. return fmt.Sprintf("unterminated single-quote: at %d", e.loc)
  190. case TokenizeErrorCodeMissingEndDoubleQuote:
  191. return fmt.Sprintf("unterminated double-quote: at %d", e.loc)
  192. default:
  193. return fmt.Sprintf("unknown TokenizeError code: .code=%d .loc=%d", e.code, e.loc)
  194. }
  195. }