
Implement #GU3

Alois Mahdal 2 days ago
parent
commit
5bf828f434
2 changed files with 147 additions and 6 deletions
  1. app/tokenize/tokenize.go (+83, -5)
  2. app/tokenize/tokenize_test.go (+64, -1)

app/tokenize/tokenize.go (+83, -5)

@@ -47,6 +47,27 @@ func (self *reader) takeUntil(needle string) (string, bool) {
 	return out, true
 }
 
+func (self *reader) takeUntilAnyOf(needles string) (string, bool) {
+	self.checkPos()
+	to_take := uint(0)
+	found := false
+	for {
+		idx := self.pos + to_take
+		if idx == self.len() {
+			break
+		}
+		if hasByte(self.data[idx], needles) {
+			break
+		}
+		to_take += 1
+		found = true
+	}
+	end := self.pos + to_take
+	out := self.data[self.pos:end]
+	self.pos = end
+	return out, found
+}
+
 func (self *reader) tossUntilNeitherOf(needles string) bool {
 	self.checkPos()
 	tossed := uint(0)
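
For orientation only (this sketch is not part of the commit): takeUntilAnyOf consumes characters up to, but not including, the first byte that occurs in needles, and reports whether anything was consumed at all. Assuming single-byte (ASCII) needles, the behavior roughly corresponds to this standalone helper built on strings.IndexAny:

package main

import (
	"fmt"
	"strings"
)

// takeUntilAnyOf-like sketch: return the prefix of s before the first byte
// found in needles (or all of s if none occurs), plus whether any characters
// were consumed. Illustrative only; the commit's version works on a reader
// with a moving position and byte-wise hasByte checks.
func takeUntilAnyOf(s, needles string) (string, bool) {
	idx := strings.IndexAny(s, needles)
	if idx == -1 {
		idx = len(s) // no needle present: take everything
	}
	return s[:idx], idx > 0
}

func main() {
	fmt.Println(takeUntilAnyOf("abxa", "x")) // returns "ab", true
	fmt.Println(takeUntilAnyOf("xa", "x"))   // returns "", false
	fmt.Println(takeUntilAnyOf("aboa", "x")) // returns "aboa", true
}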
@@ -79,6 +100,16 @@ func (self *reader) tossChar() {
 	self.pos += 1
 }
 
+func (self *reader) tossChars(n uint) bool {
+	self.checkPos()
+	end := self.pos + n
+	if end > self.len() {
+		return false
+	}
+	self.pos = end
+	return true
+}
+
 func (self *reader) done() bool {
 	self.checkPos()
 	return self.pos == self.len()
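
Also for orientation (not from the commit): tossChars(n) advances the reading position by n only when at least n characters remain, otherwise it leaves the position untouched and returns false. A minimal standalone sketch of that contract:

package main

import "fmt"

// tossChars-like sketch: bounds-checked advance of a position within data.
// Illustrative only; the commit's version is a method on the reader type.
func tossChars(pos *int, data string, n int) bool {
	if *pos+n > len(data) {
		return false // not enough characters left; position stays unchanged
	}
	*pos += n
	return true
}

func main() {
	pos := 0
	fmt.Println(tossChars(&pos, "abc", 2), pos) // true 2
	fmt.Println(tossChars(&pos, "abc", 5), pos) // false 2
}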
@@ -150,6 +181,45 @@ const (
 	tokenizeResultCodeMissingEscapedCharacter
 )
 
+func tokenize_dq(str string) (string, tokenizeResultCode, uint) {
+	buf := ""
+	rdr := reader{data: str}
+	for {
+		dq_char, ok := rdr.peekChar()
+		if !ok {
+			return "", tokenizeResultCodeMissingEndDoubleQuote, rdr.pos
+		}
+		switch dq_char {
+		case '\\':
+			rdr.tossChar()
+			next_char, ok := rdr.peekChar()
+			if !ok {
+				return "", tokenizeResultCodeMissingEscapedCharacter, rdr.pos - 1
+			}
+			switch next_char {
+			case '"':
+				rdr.tossChar()
+				buf = buf + string(next_char)
+			case '\\':
+				rdr.tossChar()
+				buf = buf + string(next_char)
+			default:
+				buf = buf + "\\"
+			}
+		case '"':
+			rdr.tossChar()
+			return buf, tokenizeResultCodeOk, rdr.pos
+		default:
+			dq_chars, found := rdr.takeUntilAnyOf("\\\"")
+			if !found {
+				return "", tokenizeResultCodeMissingEndDoubleQuote, rdr.pos
+			}
+			buf = buf + dq_chars
+		}
+	}
+
+}
+
 func tokenize(str string) tokenizeResult {
 	rdr := reader{data: str}
 	b := make_builder()
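
A note on the escape rules above (illustration only, not part of the diff): inside a double-quoted section, \" and \\ collapse to the escaped character, while a backslash followed by anything else is kept literally. Ignoring the closing-quote and error handling that tokenize_dq also performs, the rules amount to this standalone sketch with a hypothetical helper name:

package main

import "fmt"

// unescapeDQ applies the same escape rules as the double-quote scanner
// in the commit, for an already-extracted body: \" -> ", \\ -> \, and any
// other \x stays as \x. Hypothetical helper, for illustration only; error
// handling for a dangling trailing backslash is omitted here.
func unescapeDQ(body string) string {
	out := ""
	for i := 0; i < len(body); i++ {
		if body[i] == '\\' && i+1 < len(body) {
			switch body[i+1] {
			case '"', '\\':
				out += string(body[i+1])
				i++
				continue
			}
		}
		out += string(body[i])
	}
	return out
}

func main() {
	fmt.Println(unescapeDQ(`a \" b`)) // a " b
	fmt.Println(unescapeDQ(`a \\ b`)) // a \ b
	fmt.Println(unescapeDQ(`a \x b`)) // a \x b
}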
@@ -178,12 +248,20 @@ func tokenize(str string) tokenizeResult {
 			rdr.tossChar() // the second `'`
 		case '"':
 			rdr.tossChar() // first `"`
-			new_chars, found := rdr.takeUntil("\"")
-			if !found {
-				return tokenizeResult{code: tokenizeResultCodeMissingEndDoubleQuote, err_loc: rdr.pos - 1}
+			dq_chars, code, dq_rdr_taken := tokenize_dq(rdr.data[rdr.pos:])
+			switch code {
+			case tokenizeResultCodeOk:
+				ok := rdr.tossChars(dq_rdr_taken)
+				if !ok {
+					panic(fmt.Sprintf(
+						"invalid tokenize_dq() result: claims to have taken %d chars but there were only %d",
+						dq_rdr_taken, rdr.len()-rdr.pos,
+					))
+				}
+				b.bufAppend(dq_chars)
+			default:
+				return tokenizeResult{code: code, err_loc: dq_rdr_taken}
 			}
-			b.bufAppend(new_chars)
-			rdr.tossChar() // the second `"`
 		case '\\':
 			rdr.tossChar()
 			if rdr.done() {
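
As a follow-up note on the change above (wording mine, not from the commit message): the double-quote body is no longer consumed with a plain takeUntil; instead a sub-tokenizer runs on the remaining slice rdr.data[rdr.pos:] and reports how many characters it consumed, and the caller advances its own reader by that count via tossChars. A minimal standalone sketch of that delegate-and-advance pattern, with hypothetical names:

package main

import "fmt"

// scanBody is a stand-in for a sub-scanner like tokenize_dq: it consumes
// input up to and including a closing '"' and reports how many bytes it took,
// so the caller can advance its own cursor. Illustrative sketch only.
func scanBody(rest string) (body string, taken int, ok bool) {
	for i := 0; i < len(rest); i++ {
		if rest[i] == '"' {
			return rest[:i], i + 1, true // +1 accounts for the closing quote
		}
	}
	return "", len(rest), false
}

func main() {
	data := `"hello" world`
	pos := 1 // caller has already consumed the opening quote at data[0]
	body, taken, ok := scanBody(data[pos:])
	pos += taken // advance the caller's cursor by what the helper consumed
	fmt.Println(body, pos, ok) // hello 7 true
}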

app/tokenize/tokenize_test.go (+64, -1)

@@ -39,7 +39,7 @@ func Test_tokenize(t *testing.T) {
 		{"fo\"\"o", tokenizeResult{tokens: []string{"foo"}}},
 		{"foo \"\" bar", tokenizeResult{tokens: []string{"foo", "", "bar"}}},
 		{"foo \"and\" bar", tokenizeResult{tokens: []string{"foo", "and", "bar"}}},
-		{"foo \"\\\t\n\" bar", tokenizeResult{tokens: []string{"foo", "\\\t\n", "bar"}}},
+		{"foo \"\\\\\t\n\" bar", tokenizeResult{tokens: []string{"foo", "\\\t\n", "bar"}}},
 		{"foo \" space bar \"", tokenizeResult{tokens: []string{"foo", " space bar "}}},
 		{"foo \"Joe's lunch\"", tokenizeResult{tokens: []string{"foo", "Joe's lunch"}}},
 
@@ -58,6 +58,19 @@ func Test_tokenize(t *testing.T) {
 		{"foo \\\\ bar", tokenizeResult{tokens: []string{"foo", "\\", "bar"}}},
 		{"foo \\'bar\\' baz", tokenizeResult{tokens: []string{"foo", "'bar'", "baz"}}},
 
+		// backslash within double quotes
+		{"\"\\", tokenizeResult{tokens: []string{}, code: tokenizeResultCodeMissingEscapedCharacter}},
+		{"\"\\\"", tokenizeResult{tokens: []string{}, code: tokenizeResultCodeMissingEndDoubleQuote, err_loc: 2}},
+		{"\"  \\\"  \"", tokenizeResult{tokens: []string{"  \"  "}}},
+		{"\"  \\\\  \"", tokenizeResult{tokens: []string{"  \\  "}}},
+		{"\"  \\x  \"", tokenizeResult{tokens: []string{"  \\x  "}}},
+		{"fo\"o\\\\b\"ar", tokenizeResult{tokens: []string{"foo\\bar"}}},
+		{"fo\"o \\\\ \"bar\" b\"az", tokenizeResult{tokens: []string{"foo \\ bar baz"}}},
+
+		// CC cases
+		{"cat \"/tmp/fox/'f 2'\" \"/tmp/fox/'f  \\73'\" \"/tmp/fox/'f \\21\\'\"",
+			tokenizeResult{tokens: []string{"cat", "/tmp/fox/'f 2'", "/tmp/fox/'f  \\73'", "/tmp/fox/'f \\21\\'"}},
+		},
 		//
 	}
 	for _, tc := range test_cases {
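
Because the table entries are Go string literals, the double-escaping can be hard to read. For reference only (this snippet is not part of the commit), a few of the new inputs decode as follows:

package main

import "fmt"

func main() {
	// Raw tokenizer inputs behind some of the new table entries:
	fmt.Println("\"  \\\"  \"")             // prints: "  \"  "
	fmt.Println("\"  \\\\  \"")             // prints: "  \\  "
	fmt.Println("fo\"o \\\\ \"bar\" b\"az") // prints: fo"o \\ "bar" b"az
}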
@@ -80,6 +93,9 @@ func Test_tokenize(t *testing.T) {
 					continue
 				}
 				t.Errorf("unexpected token in result .tokens[%d]: got %q, want %q in %v", i, have_result.tokens[i], tc.want_result.tokens[i], have_result)
+				t.Errorf(" .. test_str: ⟅%s⟆", tc.test_str)
+				t.Errorf(" .. got:      ⟅%s⟆", have_result.tokens[i])
+				t.Errorf(" .. want:     ⟅%s⟆", tc.want_result.tokens[i])
 				return
 			}
 		})
@@ -127,3 +143,50 @@ func Test_reader_tossUntilNeitherOf(t *testing.T) {
 		})
 	}
 }
+
+func Test_reader_takeUntilAnyOf(t *testing.T) {
+	var test_cases = []struct {
+		test_startpos uint
+		test_data     string
+		test_needles  string
+		want_endpos   uint
+		want_ok       bool
+	}{
+		{0, "", "", 0, false},
+		{0, "", "x", 0, false},
+		{0, "", "xy", 0, false},
+		{0, "x", "", 1, true},
+		{0, "x", "x", 0, false},
+		{0, "x", "xy", 0, false},
+		{0, "x", "yx", 0, false},
+		{0, "xa", "x", 0, false},
+		{0, "xa", "xy", 0, false},
+		{0, "xa", "yx", 0, false},
+		{0, "xya", "x", 0, false},
+		{0, "xya", "xy", 0, false},
+		{0, "xya", "yx", 0, false},
+		{0, "ax", "x", 1, true},
+		{0, "ax", "xy", 1, true},
+		{0, "ax", "yx", 1, true},
+		{0, "axy", "x", 1, true},
+		{0, "axy", "xy", 1, true},
+		{0, "axy", "yx", 1, true},
+		{0, "abxa", "x", 2, true},
+		{0, "aboa", "x", 4, true},
+	}
+	for _, tc := range test_cases {
+		t.Run(fmt.Sprintf("%q[%d:]-%q", tc.test_data, tc.test_startpos, tc.test_needles), func(t *testing.T) {
+			test_reader := reader{data: tc.test_data}
+			test_reader.pos = tc.test_startpos
+			_, have_ok := test_reader.takeUntilAnyOf(tc.test_needles)
+			if test_reader.pos != tc.want_endpos {
+				t.Errorf("unexpected position after take: got %d, want %d", test_reader.pos, tc.want_endpos)
+				return
+			}
+			if have_ok != tc.want_ok {
+				t.Errorf("unexpected ok: got %v, want %v", have_ok, tc.want_ok)
+				return
+			}
+		})
+	}
+}