123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
  1. package tokenize
  2. import "fmt"
  3. import "strings"
// reader is a byte-oriented cursor over an immutable input string.
// pos is the index of the next unread byte; pos == len(data) means the
// input is exhausted (see done).
type reader struct {
	data string // the full input being tokenized; never mutated
	pos  uint   // index of the next byte to read; invariant: pos <= len(data)
}
  8. func (self *reader) len() uint {
  9. return uint(len(self.data))
  10. }
  11. func (self *reader) checkPos() {
  12. if self.pos > uint(len(self.data)) {
  13. panic(fmt.Sprintf("invalid cursor state: pos=%d but len(data)=%d", self.pos, len(self.data)))
  14. }
  15. }
  16. func (self *reader) peekByte() (byte, bool) {
  17. self.checkPos()
  18. if self.done() {
  19. return 0, false
  20. }
  21. return self.data[self.pos], true
  22. }
  23. func (self *reader) takeChar() string {
  24. self.checkPos()
  25. self.pos += 1
  26. return self.data[self.pos-1 : self.pos]
  27. }
  28. func (self *reader) takeUntil(needle string) (string, bool) {
  29. self.checkPos()
  30. idx := strings.Index(self.data[self.pos:], needle)
  31. if idx == -1 {
  32. return "", false
  33. }
  34. end := self.pos + uint(idx)
  35. out := self.data[self.pos:end]
  36. self.pos = end
  37. return out, true
  38. }
  39. func (self *reader) tossUntilNeitherOf(needles string) bool {
  40. self.checkPos()
  41. match := func(b byte, ns string) bool {
  42. for i := range len(ns) {
  43. if b == ns[i] {
  44. return true
  45. }
  46. }
  47. return false
  48. }
  49. tossed := uint(0)
  50. for {
  51. idx := self.pos + tossed
  52. if idx == self.len() {
  53. break
  54. }
  55. if match(self.data[idx], needles) {
  56. tossed += 1
  57. continue
  58. }
  59. break
  60. }
  61. self.pos += tossed
  62. return tossed > 0
  63. }
  64. func (self *reader) tossChar() {
  65. self.checkPos()
  66. self.pos += 1
  67. }
  68. func (self *reader) done() bool {
  69. self.checkPos()
  70. return self.pos == self.len()
  71. }
// builder accumulates bytes into the token currently being built and
// collects finished tokens. buf_set distinguishes "no token in progress"
// from a token that is in progress but still empty (e.g. produced by an
// empty quoted string ''), so empty quoted tokens are preserved.
type builder struct {
	tokens    []string // completed tokens, in input order
	buf_set   bool     // true while a token is being accumulated
	buf_chars string   // bytes of the token currently being accumulated
}
  77. func make_builder() builder {
  78. return builder{tokens: make([]string, 0), buf_set: false, buf_chars: ""}
  79. }
  80. func (self *builder) bufAppend(chars string) {
  81. if self.buf_set {
  82. self.buf_chars += chars
  83. } else {
  84. self.buf_chars = chars
  85. self.buf_set = true
  86. }
  87. }
  88. func (self *builder) bufAppendChar(char byte) {
  89. if !self.buf_set {
  90. self.buf_chars = ""
  91. self.buf_set = true
  92. }
  93. self.buf_chars += string(char)
  94. }
  95. func (self *builder) bufCommit() {
  96. if !self.buf_set {
  97. return
  98. }
  99. self.tokens = append(self.tokens, self.buf_chars)
  100. self.buf_chars = ""
  101. self.buf_set = false
  102. }
// tokenizeResult is the internal outcome of tokenize: the parsed tokens
// on success, or a non-Ok code plus the byte offset where the problem
// was detected.
type tokenizeResult struct {
	tokens  []string           // parsed tokens; only populated when code is Ok
	code    tokenizeResultCode // Ok (the zero value) or a specific failure
	err_loc uint               // byte offset of the offending character; meaningful only on failure
}
  108. type tokenizeResultCode uint8
  109. func (self tokenizeResultCode) String() string {
  110. switch self {
  111. case tokenizeResultCodeOk:
  112. return "tokenizeResultCodeOk"
  113. case tokenizeResultCodeMissingEndSingleQuote:
  114. return "tokenizeResultCodeMissingEndSingleQuote"
  115. case tokenizeResultCodeMissingEndDoubleQuote:
  116. return "tokenizeResultCodeMissingEndDoubleQuote"
  117. case tokenizeResultCodeMissingEscapedCharacter:
  118. return "tokenizeResultCodeMissingEscapedCharacter"
  119. default:
  120. return fmt.Sprintf("unknown!tokenizeResultCode(%d)", self)
  121. }
  122. }
  123. const (
  124. tokenizeResultCodeOk tokenizeResultCode = iota
  125. tokenizeResultCodeMissingEndSingleQuote
  126. tokenizeResultCodeMissingEndDoubleQuote
  127. tokenizeResultCodeMissingEscapedCharacter
  128. )
  129. func tokenize(str string) tokenizeResult {
  130. rdr := reader{data: str}
  131. b := make_builder()
  132. for {
  133. this_byte, ok := rdr.peekByte()
  134. if !ok {
  135. break
  136. }
  137. switch this_byte {
  138. case ' ':
  139. b.bufCommit()
  140. rdr.tossUntilNeitherOf(" \t\n")
  141. case '\t':
  142. b.bufCommit()
  143. rdr.tossUntilNeitherOf(" \t\n")
  144. case '\n':
  145. b.bufCommit()
  146. rdr.tossUntilNeitherOf(" \t\n")
  147. case '\'':
  148. rdr.tossChar() // first `'`
  149. new_chars, found := rdr.takeUntil("'")
  150. if !found {
  151. return tokenizeResult{code: tokenizeResultCodeMissingEndSingleQuote, err_loc: rdr.pos - 1}
  152. }
  153. b.bufAppend(new_chars)
  154. rdr.tossChar() // the second `'`
  155. case '"':
  156. rdr.tossChar() // first `"`
  157. new_chars, found := rdr.takeUntil("\"")
  158. if !found {
  159. return tokenizeResult{code: tokenizeResultCodeMissingEndDoubleQuote, err_loc: rdr.pos - 1}
  160. }
  161. b.bufAppend(new_chars)
  162. rdr.tossChar() // the second `"`
  163. case '\\':
  164. rdr.tossChar()
  165. if rdr.done() {
  166. return tokenizeResult{code: tokenizeResultCodeMissingEscapedCharacter, err_loc: rdr.pos - 1}
  167. }
  168. new_chars := rdr.takeChar()
  169. b.bufAppend(new_chars)
  170. default:
  171. b.bufAppendChar(this_byte)
  172. rdr.tossChar()
  173. }
  174. }
  175. b.bufCommit()
  176. return tokenizeResult{tokens: b.tokens}
  177. }
  178. func Tokenize(str string) ([]string, error) {
  179. res := tokenize(str)
  180. switch res.code {
  181. case tokenizeResultCodeOk:
  182. return res.tokens, nil
  183. case tokenizeResultCodeMissingEndSingleQuote:
  184. return nil, TokenizeError{code: TokenizeErrorCodeMissingEndSingleQuote, loc: res.err_loc}
  185. case tokenizeResultCodeMissingEndDoubleQuote:
  186. return nil, TokenizeError{code: TokenizeErrorCodeMissingEndDoubleQuote, loc: res.err_loc}
  187. case tokenizeResultCodeMissingEscapedCharacter:
  188. return nil, TokenizeError{code: TokenizeErrorCodeMissingEscapedCharacter, loc: res.err_loc}
  189. default:
  190. return nil, TokenizeError{code: TokenizeErrorCodeGeneral, loc: res.err_loc}
  191. }
  192. }
  193. type TokenizeError struct {
  194. code TokenizeErrorCode
  195. loc uint
  196. }
  197. type TokenizeErrorCode uint8
  198. const (
  199. TokenizeErrorCodeGeneral TokenizeErrorCode = iota
  200. TokenizeErrorCodeMissingEndSingleQuote
  201. TokenizeErrorCodeMissingEndDoubleQuote
  202. TokenizeErrorCodeMissingEscapedCharacter
  203. )
  204. func (e TokenizeError) Error() string {
  205. switch e.code {
  206. case TokenizeErrorCodeMissingEndSingleQuote:
  207. return fmt.Sprintf("unterminated single-quote: at %d", e.loc)
  208. case TokenizeErrorCodeMissingEndDoubleQuote:
  209. return fmt.Sprintf("unterminated double-quote: at %d", e.loc)
  210. case TokenizeErrorCodeMissingEscapedCharacter:
  211. return fmt.Sprintf("missing escaped character: at %d", e.loc)
  212. default:
  213. return fmt.Sprintf("unknown TokenizeError code: .code=%d .loc=%d", e.code, e.loc)
  214. }
  215. }