// Package tokenize splits input strings into tokens using shell-style
// quoting rules: single quotes, double quotes, and backslash escapes.
package tokenize

import (
	"fmt"
	"strings"
)

// reader is a byte-oriented cursor over an immutable input string.
type reader struct {
	data string
	pos  uint
}
func (r *reader) len() uint {
	return uint(len(r.data))
}
// checkPos panics if the cursor has somehow advanced past the end of the
// input; it guards every method against internal bookkeeping bugs.
func (r *reader) checkPos() {
	if r.pos > uint(len(r.data)) {
		panic(fmt.Sprintf("invalid cursor state: pos=%d but len(data)=%d", r.pos, len(r.data)))
	}
}
// peekChar returns the byte at the cursor without consuming it.
func (r *reader) peekChar() (byte, bool) {
	r.checkPos()
	if r.done() {
		return 0, false
	}
	return r.data[r.pos], true
}
// takeChar consumes the byte at the cursor and returns it as a one-byte
// string.
func (r *reader) takeChar() (string, bool) {
	r.checkPos()
	if r.pos == r.len() {
		return "", false
	}
	r.pos++
	return r.data[r.pos-1 : r.pos], true
}
// takeUntil consumes everything up to (but not including) the first
// occurrence of needle. If needle is absent it consumes nothing.
func (r *reader) takeUntil(needle string) (string, bool) {
	r.checkPos()
	idx := strings.Index(r.data[r.pos:], needle)
	if idx == -1 {
		return "", false
	}
	end := r.pos + uint(idx)
	out := r.data[r.pos:end]
	r.pos = end
	return out, true
}
// takeUntilAnyOf consumes everything up to the first byte that appears in
// needles. The returned bool reports whether such a byte was found; when it
// is false, the whole remainder of the input has been consumed.
func (r *reader) takeUntilAnyOf(needles string) (string, bool) {
	r.checkPos()
	toTake := uint(0)
	found := false
	for {
		idx := r.pos + toTake
		if idx == r.len() {
			break
		}
		if hasByte(r.data[idx], needles) {
			found = true
			break
		}
		toTake++
	}
	end := r.pos + toTake
	out := r.data[r.pos:end]
	r.pos = end
	return out, found
}
// tossUntilNeitherOf discards a leading run of bytes drawn from needles and
// reports whether anything was discarded.
func (r *reader) tossUntilNeitherOf(needles string) bool {
	r.checkPos()
	tossed := uint(0)
	for {
		idx := r.pos + tossed
		if idx == r.len() {
			break
		}
		if !hasByte(r.data[idx], needles) {
			break
		}
		tossed++
	}
	r.pos += tossed
	return tossed > 0
}
// hasByte reports whether b occurs anywhere in ns.
func hasByte(b byte, ns string) bool {
	return strings.IndexByte(ns, b) != -1
}
// tossChar discards one byte; callers must ensure the reader is not done.
func (r *reader) tossChar() {
	r.checkPos()
	r.pos++
}
// tossChars discards n bytes, or discards nothing and returns false when
// fewer than n remain.
func (r *reader) tossChars(n uint) bool {
	r.checkPos()
	end := r.pos + n
	if end > r.len() {
		return false
	}
	r.pos = end
	return true
}
// done reports whether the cursor has reached the end of the input.
func (r *reader) done() bool {
	r.checkPos()
	return r.pos == r.len()
}
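// A quick sketch of the cursor semantics above (illustrative only, not part
// of the package's API):
//
//	r := reader{data: "ab cd"}
//	r.takeUntilAnyOf(" \t")   // → ("ab", true); cursor now on the space
//	r.tossUntilNeitherOf(" ") // → true; cursor now on 'c'
//	r.takeChar()              // → ("c", true)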
// builder accumulates bytes for the token under construction. bufSet
// distinguishes "no pending token" from a pending empty token (such as the
// one produced by ''), which bufChars alone cannot express.
type builder struct {
	tokens   []string
	bufSet   bool
	bufChars string
}

func newBuilder() builder {
	// tokens is non-nil so empty input yields an empty slice, not nil.
	return builder{tokens: make([]string, 0)}
}
func (b *builder) bufAppend(chars string) {
	if b.bufSet {
		b.bufChars += chars
	} else {
		b.bufChars = chars
		b.bufSet = true
	}
}
func (b *builder) bufAppendChar(char byte) {
	if !b.bufSet {
		b.bufChars = ""
		b.bufSet = true
	}
	b.bufChars += string(char)
}
// bufCommit flushes the pending token, if any, onto the token list.
func (b *builder) bufCommit() {
	if !b.bufSet {
		return
	}
	b.tokens = append(b.tokens, b.bufChars)
	b.bufChars = ""
	b.bufSet = false
}
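// Sketch of the buffer/commit dance (derived from the methods above): the
// bufSet flag is what lets '' produce an empty token, since bufAppend("")
// marks the buffer as set even though no bytes were added.
//
//	b := newBuilder()
//	b.bufAppend("")
//	b.bufCommit() // b.tokens == []string{""}
//	b.bufCommit() // no-op: nothing is pending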
// tokenizeResult carries the tokens on success, or otherwise an error code
// plus the byte offset (errLoc) at which tokenizing failed.
type tokenizeResult struct {
	tokens []string
	code   tokenizeResultCode
	errLoc uint
}
type tokenizeResultCode uint8

func (c tokenizeResultCode) String() string {
	switch c {
	case tokenizeResultCodeOk:
		return "tokenizeResultCodeOk"
	case tokenizeResultCodeMissingEndSingleQuote:
		return "tokenizeResultCodeMissingEndSingleQuote"
	case tokenizeResultCodeMissingEndDoubleQuote:
		return "tokenizeResultCodeMissingEndDoubleQuote"
	case tokenizeResultCodeMissingEscapedCharacter:
		return "tokenizeResultCodeMissingEscapedCharacter"
	default:
		return fmt.Sprintf("unknown!tokenizeResultCode(%d)", uint8(c))
	}
}
const (
	tokenizeResultCodeOk tokenizeResultCode = iota
	tokenizeResultCodeMissingEndSingleQuote
	tokenizeResultCodeMissingEndDoubleQuote
	tokenizeResultCodeMissingEscapedCharacter
)
// tokenizeDoubleQuoted consumes the body of a double-quoted string; the
// caller must already have consumed the opening quote. It returns the
// unescaped contents, a result code, and the number of bytes consumed,
// which on success includes the closing quote. Inside double quotes only
// \" and \\ are collapsed; a backslash before any other byte stays literal.
func tokenizeDoubleQuoted(str string) (string, tokenizeResultCode, uint) {
	buf := ""
	rdr := reader{data: str}
	for {
		ch, ok := rdr.peekChar()
		if !ok {
			return "", tokenizeResultCodeMissingEndDoubleQuote, rdr.pos
		}
		switch ch {
		case '\\':
			rdr.tossChar()
			next, ok := rdr.peekChar()
			if !ok {
				return "", tokenizeResultCodeMissingEscapedCharacter, rdr.pos - 1
			}
			switch next {
			case '"', '\\':
				rdr.tossChar()
				buf += string(next)
			default:
				// Keep the backslash literal; next is left in place and
				// handled on the following iteration.
				buf += "\\"
			}
		case '"':
			rdr.tossChar() // the closing `"`
			return buf, tokenizeResultCodeOk, rdr.pos
		default:
			chunk, found := rdr.takeUntilAnyOf("\\\"")
			if !found {
				// Ran off the end of the input without a closing quote.
				return "", tokenizeResultCodeMissingEndDoubleQuote, rdr.pos
			}
			buf += chunk
		}
	}
}
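// Worked example, traced by hand against the rules above (an illustration,
// not a spec); the count of 5 covers a, \, ", b, and the closing quote:
//
//	tokenizeDoubleQuoted(`a\"b" tail`) // → (`a"b`, tokenizeResultCodeOk, 5)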
// tokenize drives the main loop: runs of whitespace commit the pending
// token, quoted and escaped regions are rewritten, and every other byte is
// copied through as-is.
func tokenize(str string) tokenizeResult {
	rdr := reader{data: str}
	b := newBuilder()
	for {
		ch, ok := rdr.peekChar()
		if !ok {
			break
		}
		switch ch {
		case ' ', '\t', '\n':
			b.bufCommit()
			rdr.tossUntilNeitherOf(" \t\n")
		case '\'':
			rdr.tossChar() // the opening `'`
			chunk, found := rdr.takeUntil("'")
			if !found {
				return tokenizeResult{code: tokenizeResultCodeMissingEndSingleQuote, errLoc: rdr.pos - 1}
			}
			b.bufAppend(chunk)
			rdr.tossChar() // the closing `'`
		case '"':
			rdr.tossChar() // the opening `"`
			chunk, code, consumed := tokenizeDoubleQuoted(rdr.data[rdr.pos:])
			switch code {
			case tokenizeResultCodeOk:
				if ok := rdr.tossChars(consumed); !ok {
					panic(fmt.Sprintf(
						"invalid tokenizeDoubleQuoted() result: claims to have taken %d chars but there were only %d",
						consumed, rdr.len()-rdr.pos,
					))
				}
				b.bufAppend(chunk)
			default:
				// consumed is relative to the substring handed to
				// tokenizeDoubleQuoted; translate it into an absolute
				// offset so errLoc matches the other error paths.
				return tokenizeResult{code: code, errLoc: rdr.pos + consumed}
			}
		case '\\':
			rdr.tossChar()
			if rdr.done() {
				return tokenizeResult{code: tokenizeResultCodeMissingEscapedCharacter, errLoc: rdr.pos - 1}
			}
			// Outside quotes, a backslash escapes exactly one character.
			chunk, _ := rdr.takeChar()
			b.bufAppend(chunk)
		default:
			b.bufAppendChar(ch)
			rdr.tossChar()
		}
	}
	b.bufCommit()
	return tokenizeResult{tokens: b.tokens}
}
// Tokenize splits str into shell-style tokens, mapping internal result
// codes onto exported TokenizeError values.
func Tokenize(str string) ([]string, error) {
	res := tokenize(str)
	switch res.code {
	case tokenizeResultCodeOk:
		return res.tokens, nil
	case tokenizeResultCodeMissingEndSingleQuote:
		return nil, TokenizeError{code: TokenizeErrorCodeMissingEndSingleQuote, loc: res.errLoc}
	case tokenizeResultCodeMissingEndDoubleQuote:
		return nil, TokenizeError{code: TokenizeErrorCodeMissingEndDoubleQuote, loc: res.errLoc}
	case tokenizeResultCodeMissingEscapedCharacter:
		return nil, TokenizeError{code: TokenizeErrorCodeMissingEscapedCharacter, loc: res.errLoc}
	default:
		return nil, TokenizeError{code: TokenizeErrorCodeGeneral, loc: res.errLoc}
	}
}
// TokenizeError reports what went wrong and the byte offset where it did.
type TokenizeError struct {
	code TokenizeErrorCode
	loc  uint
}

type TokenizeErrorCode uint8

const (
	TokenizeErrorCodeGeneral TokenizeErrorCode = iota
	TokenizeErrorCodeMissingEndSingleQuote
	TokenizeErrorCodeMissingEndDoubleQuote
	TokenizeErrorCodeMissingEscapedCharacter
)
// Error implements the error interface.
func (e TokenizeError) Error() string {
	switch e.code {
	case TokenizeErrorCodeMissingEndSingleQuote:
		return fmt.Sprintf("unterminated single-quote: at %d", e.loc)
	case TokenizeErrorCodeMissingEndDoubleQuote:
		return fmt.Sprintf("unterminated double-quote: at %d", e.loc)
	case TokenizeErrorCodeMissingEscapedCharacter:
		return fmt.Sprintf("missing escaped character: at %d", e.loc)
	default:
		return fmt.Sprintf("unknown TokenizeError code: .code=%d .loc=%d", e.code, e.loc)
	}
}
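// exampleTokenize sketches end-to-end usage. The expected output below is
// inferred by tracing the rules implemented above rather than taken from an
// external spec, and the function itself is illustrative, not API.
func exampleTokenize() {
	tokens, err := Tokenize(`echo 'hello world' "a \"b\"" c\ d`)
	if err != nil {
		fmt.Println("tokenize error:", err)
		return
	}
	// Expected tokens: "echo", "hello world", `a "b"`, "c d"
	fmt.Println(tokens)
}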