tokenize.go 5.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. package tokenize
  2. import "fmt"
  3. import "strings"
  4. type reader struct {
  5. data string
  6. pos uint
  7. }
  8. func (self *reader) len() uint {
  9. return uint(len(self.data))
  10. }
  11. func (self *reader) checkPos() {
  12. if self.pos > uint(len(self.data)) {
  13. panic(fmt.Sprintf("invalid cursor state: pos=%d but len(data)=%d", self.pos, len(self.data)))
  14. }
  15. }
  16. func (self *reader) peekChar() (byte, bool) {
  17. self.checkPos()
  18. if self.done() {
  19. return 0, false
  20. }
  21. return self.data[self.pos], true
  22. }
  23. func (self *reader) takeChar() (string, bool) {
  24. self.checkPos()
  25. if self.pos == self.len() {
  26. return "", false
  27. }
  28. self.pos += 1
  29. return self.data[self.pos-1 : self.pos], true
  30. }
  31. func (self *reader) takeUntil(needle string) (string, bool) {
  32. self.checkPos()
  33. idx := strings.Index(self.data[self.pos:], needle)
  34. if idx == -1 {
  35. return "", false
  36. }
  37. end := self.pos + uint(idx)
  38. out := self.data[self.pos:end]
  39. self.pos = end
  40. return out, true
  41. }
  42. func (self *reader) tossUntilNeitherOf(needles string) bool {
  43. self.checkPos()
  44. tossed := uint(0)
  45. for {
  46. idx := self.pos + tossed
  47. if idx == self.len() {
  48. break
  49. }
  50. if hasByte(self.data[idx], needles) {
  51. tossed += 1
  52. continue
  53. }
  54. break
  55. }
  56. self.pos += tossed
  57. return tossed > 0
  58. }
  59. func hasByte(b byte, ns string) bool {
  60. for i := range len(ns) {
  61. if b == ns[i] {
  62. return true
  63. }
  64. }
  65. return false
  66. }
  67. func (self *reader) tossChar() {
  68. self.checkPos()
  69. self.pos += 1
  70. }
  71. func (self *reader) done() bool {
  72. self.checkPos()
  73. return self.pos == self.len()
  74. }
  75. type builder struct {
  76. tokens []string
  77. buf_set bool
  78. buf_chars string
  79. }
  80. func make_builder() builder {
  81. return builder{tokens: make([]string, 0), buf_set: false, buf_chars: ""}
  82. }
  83. func (self *builder) bufAppend(chars string) {
  84. if self.buf_set {
  85. self.buf_chars += chars
  86. } else {
  87. self.buf_chars = chars
  88. self.buf_set = true
  89. }
  90. }
  91. func (self *builder) bufAppendChar(char byte) {
  92. if !self.buf_set {
  93. self.buf_chars = ""
  94. self.buf_set = true
  95. }
  96. self.buf_chars += string(char)
  97. }
  98. func (self *builder) bufCommit() {
  99. if !self.buf_set {
  100. return
  101. }
  102. self.tokens = append(self.tokens, self.buf_chars)
  103. self.buf_chars = ""
  104. self.buf_set = false
  105. }
// tokenizeResult is the internal outcome of tokenize.
type tokenizeResult struct {
// tokens holds the parsed tokens; only meaningful when code is Ok.
tokens []string
// code distinguishes success (the zero value, Ok) from a specific failure.
code tokenizeResultCode
// err_loc is the byte offset in the input associated with the error
// (e.g. the opening quote of an unterminated string); only meaningful
// when code is not Ok.
err_loc uint
}
  111. type tokenizeResultCode uint8
  112. func (self tokenizeResultCode) String() string {
  113. switch self {
  114. case tokenizeResultCodeOk:
  115. return "tokenizeResultCodeOk"
  116. case tokenizeResultCodeMissingEndSingleQuote:
  117. return "tokenizeResultCodeMissingEndSingleQuote"
  118. case tokenizeResultCodeMissingEndDoubleQuote:
  119. return "tokenizeResultCodeMissingEndDoubleQuote"
  120. case tokenizeResultCodeMissingEscapedCharacter:
  121. return "tokenizeResultCodeMissingEscapedCharacter"
  122. default:
  123. return fmt.Sprintf("unknown!tokenizeResultCode(%d)", self)
  124. }
  125. }
  126. const (
  127. tokenizeResultCodeOk tokenizeResultCode = iota
  128. tokenizeResultCodeMissingEndSingleQuote
  129. tokenizeResultCodeMissingEndDoubleQuote
  130. tokenizeResultCodeMissingEscapedCharacter
  131. )
  132. func tokenize(str string) tokenizeResult {
  133. rdr := reader{data: str}
  134. b := make_builder()
  135. for {
  136. this_char, ok := rdr.peekChar()
  137. if !ok {
  138. break
  139. }
  140. switch this_char {
  141. case ' ':
  142. b.bufCommit()
  143. rdr.tossUntilNeitherOf(" \t\n")
  144. case '\t':
  145. b.bufCommit()
  146. rdr.tossUntilNeitherOf(" \t\n")
  147. case '\n':
  148. b.bufCommit()
  149. rdr.tossUntilNeitherOf(" \t\n")
  150. case '\'':
  151. rdr.tossChar() // first `'`
  152. new_chars, found := rdr.takeUntil("'")
  153. if !found {
  154. return tokenizeResult{code: tokenizeResultCodeMissingEndSingleQuote, err_loc: rdr.pos - 1}
  155. }
  156. b.bufAppend(new_chars)
  157. rdr.tossChar() // the second `'`
  158. case '"':
  159. rdr.tossChar() // first `"`
  160. new_chars, found := rdr.takeUntil("\"")
  161. if !found {
  162. return tokenizeResult{code: tokenizeResultCodeMissingEndDoubleQuote, err_loc: rdr.pos - 1}
  163. }
  164. b.bufAppend(new_chars)
  165. rdr.tossChar() // the second `"`
  166. case '\\':
  167. rdr.tossChar()
  168. if rdr.done() {
  169. return tokenizeResult{code: tokenizeResultCodeMissingEscapedCharacter, err_loc: rdr.pos - 1}
  170. }
  171. new_chars, _ := rdr.takeChar()
  172. b.bufAppend(new_chars)
  173. default:
  174. b.bufAppendChar(this_char)
  175. rdr.tossChar()
  176. }
  177. }
  178. b.bufCommit()
  179. return tokenizeResult{tokens: b.tokens}
  180. }
  181. func Tokenize(str string) ([]string, error) {
  182. res := tokenize(str)
  183. switch res.code {
  184. case tokenizeResultCodeOk:
  185. return res.tokens, nil
  186. case tokenizeResultCodeMissingEndSingleQuote:
  187. return nil, TokenizeError{code: TokenizeErrorCodeMissingEndSingleQuote, loc: res.err_loc}
  188. case tokenizeResultCodeMissingEndDoubleQuote:
  189. return nil, TokenizeError{code: TokenizeErrorCodeMissingEndDoubleQuote, loc: res.err_loc}
  190. case tokenizeResultCodeMissingEscapedCharacter:
  191. return nil, TokenizeError{code: TokenizeErrorCodeMissingEscapedCharacter, loc: res.err_loc}
  192. default:
  193. return nil, TokenizeError{code: TokenizeErrorCodeGeneral, loc: res.err_loc}
  194. }
  195. }
  196. type TokenizeError struct {
  197. code TokenizeErrorCode
  198. loc uint
  199. }
  200. type TokenizeErrorCode uint8
  201. const (
  202. TokenizeErrorCodeGeneral TokenizeErrorCode = iota
  203. TokenizeErrorCodeMissingEndSingleQuote
  204. TokenizeErrorCodeMissingEndDoubleQuote
  205. TokenizeErrorCodeMissingEscapedCharacter
  206. )
  207. func (e TokenizeError) Error() string {
  208. switch e.code {
  209. case TokenizeErrorCodeMissingEndSingleQuote:
  210. return fmt.Sprintf("unterminated single-quote: at %d", e.loc)
  211. case TokenizeErrorCodeMissingEndDoubleQuote:
  212. return fmt.Sprintf("unterminated double-quote: at %d", e.loc)
  213. case TokenizeErrorCodeMissingEscapedCharacter:
  214. return fmt.Sprintf("missing escaped character: at %d", e.loc)
  215. default:
  216. return fmt.Sprintf("unknown TokenizeError code: .code=%d .loc=%d", e.code, e.loc)
  217. }
  218. }