1 /*
2 * burl - buffer URL normalization
3 *
4 * Copyright(c) 2018 Glenn Strauss gstrauss()gluelogic.com All rights reserved
5 * License: BSD 3-clause (same as lighttpd)
6 */
7 #include "first.h"
8 #include "burl.h"
9
10 #include <string.h>
11
12 #include "buffer.h"
13 #include "base64.h"
14
/* uppercase hexadecimal digits, indexed by nibble value (0x0 - 0xF) */
static const char hex_chars_uc[] = "0123456789" "ABCDEF";
16
/* Lookup table indexed by byte value (0x00 - 0xFF):
 *   1 = byte is percent-encoded by the normalization routines in this file
 *   0 = byte may appear literally
 * Literal (0) bytes are:
 *   ! $ & ' ( ) * + , - . / 0-9 : ; = ? @ A-Z _ a-z ~
 * Each row below covers 16 consecutive byte values. */
static const char encoded_chars_http_uri_reqd[] = {
  /* 0x00 - 0x0F  control chars */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x10 - 0x1F  control chars */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x20 - 0x2F  encoded: space " # % */
  1,0,1,1, 0,1,0,0, 0,0,0,0, 0,0,0,0,
  /* 0x30 - 0x3F  encoded: < > */
  0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,1,0,
  /* 0x40 - 0x4F */
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0x50 - 0x5F  encoded: [ \ ] ^ */
  0,0,0,0, 0,0,0,0, 0,0,0,1, 1,1,1,0,
  /* 0x60 - 0x6F  encoded: ` */
  1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0x70 - 0x7F  encoded: { | } DEL */
  0,0,0,0, 0,0,0,0, 0,0,0,1, 1,1,0,1,
  /* 0x80 - 0x8F  (all bytes >= 0x80 are encoded) */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x90 - 0x9F */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0xA0 - 0xAF */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0xB0 - 0xBF */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0xC0 - 0xCF */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0xD0 - 0xDF */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0xE0 - 0xEF */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0xF0 - 0xFF */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
};
39
40
/* Convert hex digit character c to its nibble value, stored in n.
 * Evaluates to 1 if c is a hex digit (0-9, A-F, a-f), else 0.
 * n is assigned in every case; when the result is 0, n holds a leftover
 * intermediate value and must not be used.
 * c (char) and n (nibble) MUST be unsigned integer types: the range checks
 * rely on unsigned wrap-around of the subtraction results. */
#define li_cton(c,n) \
  ( (((n) = (c) - '0') <= 9) \
    || ((((n) = ((c) & 0xdf) - 'A') <= 5) && ((n) += 10)) )
44
/* Evaluates to 1 if byte b can never appear in valid UTF-8, else 0.
 * b (byte) MUST be an unsigned integer type.
 * https://en.wikipedia.org/wiki/UTF-8
 * Flags 0xC0 and 0xC1 (lead bytes of overlong encodings of 7-bit ASCII)
 * and 0xF5 - 0xFF (invalid in UTF-8).
 * (does not detect other overlong multi-byte encodings) */
#define li_utf8_invalid_byte(b) \
  ((b) == 0xC0 || (b) == 0xC1 || (b) >= 0xF5)
50
51
/* Returns 1 if c is one of the characters  - . _ ~  or is alphanumeric
 * (per light_isalnum()), else 0. */
static int burl_is_unreserved (const int c)
{
    switch (c) {
      case '-':
      case '.':
      case '_':
      case '~':
        return 1;
      default:
        return light_isalnum(c) ? 1 : 0;
    }
}
56
57
/* Rewriting pass for burl_normalize_basic_unreserved().
 *
 * Called once the in-place scan reaches offset i in b and finds a byte that
 * changes the string length.  Builds the normalized URL in scratch buffer t
 * (bytes [0,i) are copied verbatim), then copies the result back into b.
 *
 *   b   URL being normalized (replaced with the normalized result)
 *   t   scratch buffer (contents overwritten)
 *   i   offset in b where rewriting starts
 *   qs  query-string state so far: -1 (no '?' seen), -2 (invalid UTF-8 byte
 *       seen), or offset of first '?'
 *
 * Returns offset of first '?' in the result, -1 if there is none, or -2 if
 * a byte that is never valid in UTF-8 was encountered. */
static int burl_normalize_basic_unreserved_fix (buffer *b, buffer *t, int i, int qs)
{
    const int len = (int)buffer_clen(b);
    const unsigned char * const in = (unsigned char *)b->ptr;
    /* worst case: every remaining input byte expands to "%XX" */
    unsigned char * const out =
      (unsigned char *)buffer_string_prepare_copy(t, i + (len - i)*3 + 1);
    unsigned int hi, lo;
    int o = i; /* write offset into out */

    memcpy(out, in, (size_t)i);

    while (i < len) {
        const unsigned char c = in[i];

        if (!encoded_chars_http_uri_reqd[c]) {
            /* byte is allowed literally; remember first '?' */
            if (-1 == qs && __builtin_expect( (c == '?'), 0)) qs = o;
            out[o++] = c;
            ++i;
        }
        else if (c == '%' && li_cton(in[i+1], hi) && li_cton(in[i+2], lo)) {
            const unsigned int x = (hi << 4) | lo;
            if (burl_is_unreserved(x)) {
                /* decode percent-encoded unreserved char */
                out[o++] = x;
            }
            else {
                /* keep encoded, with hex digits uppercased */
                out[o++] = '%';
                out[o++] = hex_chars_uc[hi];
                out[o++] = hex_chars_uc[lo];
                if (li_utf8_invalid_byte(x)) qs = -2;
            }
            i += 3;
        }
        else if (c == '#') {
            break; /* ignore fragment */
        }
        else {
            /* percent-encode byte that must not appear literally
             * (includes '%' not followed by two hex digits) */
            out[o++] = '%';
            out[o++] = hex_chars_uc[(c >> 4) & 0xF];
            out[o++] = hex_chars_uc[c & 0xF];
            if (li_utf8_invalid_byte(c)) qs = -2;
            ++i;
        }
    }

    buffer_copy_string_len(b, (char *)out, (size_t)o);
    return qs;
}
96
97
/* Basic normalization, "unreserved" mode: percent-decode only the
 * unreserved characters (- . _ ~ and alphanumerics), uppercase the hex
 * digits of escapes that stay encoded, percent-encode bytes flagged in
 * encoded_chars_http_uri_reqd[], and drop any '#fragment'.
 *
 * Scans b in place for as long as only same-length changes (uppercasing hex
 * digits) are needed; hands off to burl_normalize_basic_unreserved_fix()
 * at the first byte that changes the string length.
 *
 * Returns offset of first '?', -1 if there is none, or -2 if a byte that is
 * never valid in UTF-8 was encountered. */
static int burl_normalize_basic_unreserved (buffer *b, buffer *t)
{
    const unsigned char * const s = (unsigned char *)b->ptr;
    const int used = (int)buffer_clen(b);
    unsigned int n1, n2, x;
    int qs = -1;

    for (int i = 0; i < used; ++i) {
        const unsigned char c = s[i];

        if (!encoded_chars_http_uri_reqd[c]) {
            /* literal byte; remember first '?' */
            if (-1 == qs && __builtin_expect( (c == '?'), 0)) qs = i;
            continue;
        }

        if (c == '%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) {
            x = (n1 << 4) | n2;
            if (!burl_is_unreserved(x)) {
                /* escape stays encoded; uppercase hex digits in place */
                if (li_utf8_invalid_byte(x)) qs = -2;
                if (s[i+1] >= 'a') b->ptr[i+1] &= 0xdf;
                if (s[i+2] >= 'a') b->ptr[i+2] &= 0xdf;
                i += 2;
                continue;
            }
            /* escape of unreserved char must be decoded: needs rewrite */
        }

        if (c == '#') { /* ignore fragment */
            buffer_truncate(b, (size_t)i);
            return qs;
        }

        /* string length changes from here on; finish in rewriting pass */
        return burl_normalize_basic_unreserved_fix(b, t, i, qs);
    }

    return qs;
}
128
129
/* Rewriting pass for burl_normalize_basic_required().
 *
 * Called once the in-place scan reaches offset i in b and finds a byte that
 * changes the string length.  Builds the normalized URL in scratch buffer t
 * (bytes [0,i) are copied verbatim), then copies the result back into b.
 *
 *   b   URL being normalized (replaced with the normalized result)
 *   t   scratch buffer (contents overwritten)
 *   i   offset in b where rewriting starts
 *   qs  offset of first '?' seen so far, or -1 if none
 *
 * Returns offset of the first '?' in the result, -1 if there is none, or
 * -2 if a byte that is never valid in UTF-8 was encountered. */
static int burl_normalize_basic_required_fix (buffer *b, buffer *t, int i, int qs)
{
    int j = i;
    const int used = (int)buffer_clen(b);
    const unsigned char * const s = (unsigned char *)b->ptr;
    /* worst case: every remaining input byte expands to "%XX" */
    unsigned char * const p =
      (unsigned char *)buffer_string_prepare_copy(t,i+(used-i)*3+1);
    unsigned int n1, n2;
    int invalid_utf8 = 0;
    memcpy(p, s, (size_t)i);
    for (; i < used; ++i, ++j) {
        if (!encoded_chars_http_uri_reqd[s[i]]) {
            p[j] = s[i];
            /* query string begins at the FIRST '?'; any later '?' is part
             * of the query and must not move the recorded offset */
            if (__builtin_expect( (s[i] == '?'), 0) && -1 == qs) qs = j;
        }
        else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) {
            const unsigned int x = (n1 << 4) | n2;
            /* decode only if the byte may appear literally and is not a
             * delimiter of the current component:
             *   path  (qs < 0):  keep '/' and '?' encoded
             *   query (qs >= 0): keep '&' '=' ';' '+' encoded */
            if (!encoded_chars_http_uri_reqd[x]
                && (qs < 0
                    ? (x != '/' && x != '?')
                    : (x != '&' && x != '=' && x != ';' && x != '+'))) {
                p[j] = x;
            }
            else {
                /* keep encoded, with hex digits uppercased */
                p[j] = '%';
                p[++j] = hex_chars_uc[n1]; /*(s[i+1] & 0xdf)*/
                p[++j] = hex_chars_uc[n2]; /*(s[i+2] & 0xdf)*/
                invalid_utf8 |= li_utf8_invalid_byte(x);
            }
            i+=2;
        }
        else if (s[i] == '#') break; /* ignore fragment */
        else {
            /* percent-encode byte that must not appear literally
             * (includes '%' not followed by two hex digits) */
            p[j] = '%';
            p[++j] = hex_chars_uc[(s[i] >> 4) & 0xF];
            p[++j] = hex_chars_uc[s[i] & 0xF];
            invalid_utf8 |= li_utf8_invalid_byte(s[i]);
        }
    }
    buffer_copy_string_len(b, (char *)p, (size_t)j);
    return !invalid_utf8 ? qs : -2;
}
172
173
/* Basic normalization, "required" mode: percent-decode every escape whose
 * byte may appear literally (per encoded_chars_http_uri_reqd[]) and is not
 * a delimiter of the current component, uppercase the hex digits of escapes
 * that stay encoded, percent-encode bytes that must not appear literally,
 * and drop any '#fragment'.
 *
 * Scans b in place for as long as only same-length changes (uppercasing hex
 * digits) are needed; hands off to burl_normalize_basic_required_fix()
 * at the first byte that changes the string length.
 *
 * Returns offset of the first '?', -1 if there is none, or -2 if a byte
 * that is never valid in UTF-8 was encountered. */
static int burl_normalize_basic_required (buffer *b, buffer *t)
{
    const unsigned char * const s = (unsigned char *)b->ptr;
    const int used = (int)buffer_clen(b);
    unsigned int n1, n2, x;
    int qs = -1;
    int invalid_utf8 = 0;

    for (int i = 0; i < used; ++i) {
        if (!encoded_chars_http_uri_reqd[s[i]]) {
            /* query string begins at the FIRST '?'; any later '?' is part
             * of the query and must not move the recorded offset */
            if (__builtin_expect( (s[i] == '?'), 0) && -1 == qs) qs = i;
        }
        else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)
                 /* escape must remain encoded if the byte may not appear
                  * literally, or if it is a delimiter of the current
                  * component:
                  *   path  (qs < 0):  '/' '?'
                  *   query (qs >= 0): '&' '=' ';' '+' */
                 && (encoded_chars_http_uri_reqd[(x = (n1 << 4) | n2)]
                     || (qs < 0
                         ? (x == '/' || x == '?')
                         : (x == '&' || x == '=' || x == ';' || x == '+')))) {
            invalid_utf8 |= li_utf8_invalid_byte(x);
            if (s[i+1] >= 'a') b->ptr[i+1] &= 0xdf; /* uppercase hex */
            if (s[i+2] >= 'a') b->ptr[i+2] &= 0xdf; /* uppercase hex */
            i+=2;
        }
        else if (s[i] == '#') { /* ignore fragment */
            buffer_truncate(b, (size_t)i);
            break;
        }
        else {
            /* string length changes from here on; finish in rewriting pass */
            qs = burl_normalize_basic_required_fix(b, t, i, qs);
            break;
        }
    }

    return !invalid_utf8 ? qs : -2;
}
208
209
burl_contains_ctrls(const buffer * b)210 static int burl_contains_ctrls (const buffer *b)
211 {
212 const char * const s = b->ptr;
213 const int used = (int)buffer_clen(b);
214 for (int i = 0; i < used; ++i) {
215 if (s[i] == '%' && (s[i+1] < '2' || (s[i+1] == '7' && s[i+2] == 'F')))
216 return 1;
217 }
218 return 0;
219 }
220
221
/* Replace each "%20" found at or after offset i in b with '+', compacting
 * the string in place, then truncate b to the new length. */
static void burl_normalize_qs20_to_plus_fix (buffer *b, int i)
{
    char * const s = b->ptr;
    const int len = (int)buffer_clen(b);
    int rd = i; /* read offset */
    int wr = i; /* write offset (wr <= rd) */
    while (rd < len) {
        if (s[rd] == '%' && s[rd+1] == '2' && s[rd+2] == '0') {
            s[wr++] = '+';
            rd += 3;
        }
        else {
            s[wr++] = s[rd++];
        }
    }
    buffer_truncate(b, wr);
}
236
237
/* Convert "%20" to '+' in the part of b that follows offset qs.
 * Does nothing if qs < 0.  Locates the first "%20" after qs and lets
 * burl_normalize_qs20_to_plus_fix() rewrite the string from there. */
static void burl_normalize_qs20_to_plus (buffer *b, int qs)
{
    if (qs < 0) return;

    const char * const s = b->ptr;
    const int len = (int)buffer_clen(b);
    int i = qs + 1;
    while (i < len && !(s[i] == '%' && s[i+1] == '2' && s[i+2] == '0'))
        ++i;

    if (i != len) burl_normalize_qs20_to_plus_fix(b, i);
}
249
250
/* Replace each "%2F" in the range [i, end of path) of b with '/',
 * compacting in place.  The path ends at offset qs when qs >= 0, else at
 * the end of b.  When qs >= 0, the bytes from qs onward are moved down to
 * follow the shortened path.
 *
 * Returns the new offset of the byte that was at qs, or qs unchanged if
 * qs < 0. */
static int burl_normalize_2F_to_slash_fix (buffer *b, int qs, int i)
{
    char * const s = b->ptr;
    const int total = (int)buffer_clen(b);
    const int path_end = (qs >= 0) ? qs : total;
    int rd = i; /* read offset */
    int wr = i; /* write offset (wr <= rd) */

    while (rd < path_end) {
        if (s[rd] == '%' && s[rd+1] == '2' && s[rd+2] == 'F') {
            s[wr++] = '/';
            rd += 3;
        }
        else {
            s[wr++] = s[rd++];
        }
    }

    if (qs < 0) {
        buffer_truncate(b, wr);
        return qs;
    }

    /* slide remainder (from old qs to end) down behind the compacted path;
     * ranges may overlap, hence memmove() */
    const int qslen = total - qs;
    memmove(s+wr, s+qs, (size_t)qslen);
    buffer_truncate(b, wr + qslen);
    return wr;
}
273
274
/* Handle "%2F" in the path portion of b (before offset qs, or the whole
 * string if qs < 0).
 * ("%2F" must already have been uppercased during normalization)
 *
 * If no "%2F" is present, returns qs unchanged.  Otherwise, if flags has
 * HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE, decodes every "%2F" in the
 * path to '/' and returns the adjusted qs; else returns -2. */
static int burl_normalize_2F_to_slash (buffer *b, int qs, int flags)
{
    const char * const s = b->ptr;
    const int path_len = (qs >= 0) ? qs : (int)buffer_clen(b);
    int i = 0;

    /* locate first "%2F" in path */
    while (i < path_len && !(s[i] == '%' && s[i+1] == '2' && s[i+2] == 'F'))
        ++i;

    if (i >= path_len) return qs; /* none found */

    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE)
        return burl_normalize_2F_to_slash_fix(b, qs, i);

    return -2; /*(flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)*/
}
289
290
/* Detect and handle "." / ".." path segments and "//" in the path portion
 * of b (before offset qs, or the whole string if qs < 0).
 *
 * If none are present, returns qs unchanged.  Otherwise:
 *  - with HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT in flags,
 *    returns -2;
 *  - else runs buffer_path_simplify() on the path only (the part from qs
 *    onward is parked in scratch buffer t and re-appended afterwards) and
 *    returns the new offset of that part, or qs unchanged if qs < 0. */
static int burl_normalize_path (buffer *b, buffer *t, int qs, int flags)
{
    const unsigned char * const s = (unsigned char *)b->ptr;
    const int used = (int)buffer_clen(b);
    const int len = (qs >= 0) ? qs : used;
    int needs_simplify = 0;

    int i = 0;
    while (i < len) {
        if (s[i] == '.') {
            /* step over first dot of ".." so that s[i+1] is the byte
             * following the "." or ".." */
            if (s[i+1] == '.') ++i;
            if (s[i+1] == '/' || s[i+1] == '?' || s[i+1] == '\0') {
                needs_simplify = 1;
                break;
            }
        }
        /* advance to next '/' (or end of path) */
        while (i < len && s[i] != '/') ++i;
        if (s[i] == '/' && s[i+1] == '/') { /*(s[len] != '/')*/
            needs_simplify = 1;
            break;
        }
        ++i;
    }

    if (!needs_simplify) return qs;

    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT) return -2;

    if (qs < 0) {
        buffer_path_simplify(b);
        return qs;
    }

    /* park the part from qs onward in t, simplify path, re-attach */
    buffer_copy_string_len(t, b->ptr+qs, used - qs);
    buffer_truncate(b, qs);
    buffer_path_simplify(b);
    qs = (int)buffer_clen(b);
    buffer_append_string_len(b, BUF_PTR_LEN(t));
    return qs;
}
326
327
328 __attribute_cold__
329 __attribute_noinline__
330 __attribute_pure__
burl_scan_qmark(const buffer * const b)331 static int burl_scan_qmark (const buffer * const b) {
332 const char * const qmark = strchr(b->ptr, '?');
333 return qmark ? (int)(qmark - b->ptr) : -1;
334 }
335
336
/* Normalize the URL in b in place, according to flags.
 *
 *   b      URL to normalize (modified in place)
 *   t      scratch buffer (contents overwritten)
 *   flags  bitmask of HTTP_PARSEOPT_URL_NORMALIZE_* options
 *
 * Steps, in order:
 *  1. (Windows/Cygwin builds, if PATH_BACKSLASH_TRANS) '\\' -> '/' in path
 *  2. basic percent-encoding normalization ("required" or "unreserved" mode)
 *  3. (if CTRLS_REJECT) reject percent-encoded control chars
 *  4. (if PATH_2F_DECODE or PATH_2F_REJECT) decode or reject "%2F" in path
 *  5. (if PATH_DOTSEG_REMOVE or PATH_DOTSEG_REJECT) simplify or reject
 *     "." / ".." segments and "//" in path
 *  6. (if QUERY_20_PLUS) "%20" -> '+' in query string
 *
 * Returns offset of the '?' which begins the query string, -1 if there is
 * no query string, or -2 if the URL was rejected. */
int burl_normalize (buffer *b, buffer *t, int flags)
{
    int qs;

    /* _WIN32 is the macro predefined by all Windows compilers;
     * __WIN32 is additionally checked since some toolchains define it */
  #if defined(_WIN32) || defined(__WIN32) || defined(__CYGWIN__)
    /* Windows and Cygwin treat '\\' as '/' if '\\' is present in path;
     * convert to '/' for consistency before percent-encoding
     * normalization which will convert '\\' to "%5C" in the URL.
     * (Clients still should not be sending '\\' unencoded in requests.) */
    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_BACKSLASH_TRANS) {
        for (char *p = b->ptr; *p != '?' && *p != '\0'; ++p) {
            if (*p == '\\') *p = '/';
        }
    }
  #endif

    qs = (flags & HTTP_PARSEOPT_URL_NORMALIZE_REQUIRED)
      ? burl_normalize_basic_required(b, t)
      : burl_normalize_basic_unreserved(b, t);
    if (-2 == qs) {
        /* invalid UTF-8 byte detected; reject, or else recover the
         * query string offset which was replaced by the -2 marker */
        if (flags & HTTP_PARSEOPT_URL_NORMALIZE_INVALID_UTF8_REJECT) return -2;
        qs = burl_scan_qmark(b);
    }

    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_CTRLS_REJECT) {
        if (burl_contains_ctrls(b)) return -2;
    }

    if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE
                |HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)) {
        qs = burl_normalize_2F_to_slash(b, qs, flags);
        if (-2 == qs) return -2;
    }

    if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REMOVE
                |HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT)) {
        qs = burl_normalize_path(b, t, qs, flags);
        if (-2 == qs) return -2;
    }

    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_QUERY_20_PLUS) {
        if (qs >= 0) burl_normalize_qs20_to_plus(b, qs);
    }

    return qs;
}
383
384
/* Append str (len bytes) to b, percent-encoding everything except
 * unreserved  - . 0-9 A-Z _ a-z ~
 * unless already percent-encoded (does not double-encode).
 * Existing escapes of unreserved chars are decoded; all other existing
 * escapes are copied through unchanged (hex digit case is preserved).
 * Note: not checking for invalid UTF-8 */
static void burl_append_encode_nde (buffer * const b, const char * const str, const size_t len)
{
    /* read input as unsigned bytes (li_cton() requires unsigned operands) */
    const unsigned char * const s = (const unsigned char *)str;
    /* worst case: every input byte expands to "%XX" */
    char * const p = buffer_string_prepare_append(b, len*3);
    unsigned int n1, n2;
    int j = 0;
    for (unsigned int i = 0; i < len; ++i, ++j) {
        /* (len - i > 2): both hex digits must lie within the len bytes
         * provided by caller; never read or consume input beyond len */
        if (s[i]=='%' && len - i > 2
            && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) {
            const unsigned int x = (n1 << 4) | n2;
            if (burl_is_unreserved((int)x)) {
                p[j] = (char)x;
            }
            else { /* leave UTF-8, control chars, and required chars encoded */
                p[j] = '%';
                p[++j] = (char)s[i+1];
                p[++j] = (char)s[i+2];
            }
            i+=2;
        }
        else if (burl_is_unreserved(s[i])) {
            p[j] = (char)s[i];
        }
        else {
            p[j] = '%';
            p[++j] = hex_chars_uc[(s[i] >> 4) & 0xF];
            p[++j] = hex_chars_uc[s[i] & 0xF];
        }
    }
    buffer_commit(b, j);
}
417
418
/* Append str (len bytes) to b, percent-encoding everything except
 * unreserved  - . 0-9 A-Z _ a-z ~  plus  /
 * unless already percent-encoded (does not double-encode).
 * Existing escapes of unreserved chars are decoded; all other existing
 * escapes (including "%2F") are copied through unchanged (hex digit case
 * is preserved).
 * Note: not checking for invalid UTF-8 */
static void burl_append_encode_psnde (buffer * const b, const char * const str, const size_t len)
{
    /* read input as unsigned bytes (li_cton() requires unsigned operands) */
    const unsigned char * const s = (const unsigned char *)str;
    /* worst case: every input byte expands to "%XX" */
    char * const p = buffer_string_prepare_append(b, len*3);
    unsigned int n1, n2;
    int j = 0;
    for (unsigned int i = 0; i < len; ++i, ++j) {
        /* (len - i > 2): both hex digits must lie within the len bytes
         * provided by caller; never read or consume input beyond len */
        if (s[i]=='%' && len - i > 2
            && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) {
            const unsigned int x = (n1 << 4) | n2;
            if (burl_is_unreserved((int)x)) {
                p[j] = (char)x;
            }
            else { /* leave UTF-8, control chars, and required chars encoded */
                p[j] = '%';
                p[++j] = (char)s[i+1];
                p[++j] = (char)s[i+2];
            }
            i+=2;
        }
        else if (burl_is_unreserved(s[i]) || s[i] == '/') {
            p[j] = (char)s[i];
        }
        else {
            p[j] = '%';
            p[++j] = hex_chars_uc[(s[i] >> 4) & 0xF];
            p[++j] = hex_chars_uc[s[i] & 0xF];
        }
    }
    buffer_commit(b, j);
}
451
452
/* Append str (len bytes) to b, percent-encoding everything except
 * unreserved  - . 0-9 A-Z _ a-z ~
 * (Note: double-encodes any existing '%')
 * Note: not checking for invalid UTF-8 */
static void burl_append_encode_all (buffer * const b, const char * const str, const size_t len)
{
    /* worst case: every input byte expands to "%XX" */
    char * const out = buffer_string_prepare_append(b, len*3);
    int n = 0; /* number of bytes written to out */
    for (unsigned int i = 0; i < len; ++i) {
        const char c = str[i];
        if (burl_is_unreserved(c)) {
            out[n++] = c;
            continue;
        }
        out[n++] = '%';
        out[n++] = hex_chars_uc[(c >> 4) & 0xF];
        out[n++] = hex_chars_uc[c & 0xF];
    }
    buffer_commit(b, n);
}
472
473
/* Lowercase uppercase letters in b, starting at offset off and running to
 * the terminating '\0'.
 * (skips over all percent-encodings, including encoding of alpha chars) */
static void burl_offset_tolower (buffer * const b, const size_t off)
{
    char *p = b->ptr + off;
    while (*p) {
        if (light_isupper(*p)) {
            *p |= 0x20;
            ++p;
        }
        else if (*p == '%' && light_isxdigit(p[1]) && light_isxdigit(p[2])) {
            p += 3; /* leave "%XX" untouched */
        }
        else {
            ++p;
        }
    }
}
483
484
/* Uppercase lowercase letters in b, starting at offset off and running to
 * the terminating '\0'.
 * (skips over all percent-encodings, including encoding of alpha chars) */
static void burl_offset_toupper (buffer * const b, const size_t off)
{
    char *p = b->ptr + off;
    while (*p) {
        if (light_islower(*p)) {
            *p &= 0xdf;
            ++p;
        }
        else if (*p == '%' && light_isxdigit(p[1]) && light_isxdigit(p[2])) {
            p += 3; /* leave "%XX" untouched */
        }
        else {
            ++p;
        }
    }
}
494
495
/* Append str (len bytes) to b, transformed as selected by flags.
 *
 * Nothing is appended if len is 0.  With flags == 0, str is appended
 * verbatim.  Otherwise the first matching flag below selects the
 * transformation (checked in this order):
 *   BURL_ENCODE_NONE   append verbatim
 *   BURL_ENCODE_ALL    burl_append_encode_all()
 *   BURL_ENCODE_NDE    burl_append_encode_nde()
 *   BURL_ENCODE_PSNDE  burl_append_encode_psnde()
 *   BURL_ENCODE_B64U   base64url-encode, no padding
 *   BURL_DECODE_B64U   base64url-decode
 * Afterwards, BURL_TOLOWER or BURL_TOUPPER (BURL_TOLOWER wins if both are
 * set) changes the case of the newly appended portion, skipping "%XX". */
void burl_append (buffer * const b, const char * const str, const size_t len, const int flags)
{
    if (0 == len) return;

    if (0 == flags) {
        buffer_append_string_len(b, str, len);
        return;
    }

    /* remember where the appended data begins, for case conversion below */
    const int change_case = flags & (BURL_TOUPPER|BURL_TOLOWER);
    size_t off = 0;
    if (change_case) off = buffer_clen(b);

    if (flags & BURL_ENCODE_NONE) {
        buffer_append_string_len(b, str, len);
    }
    else if (flags & BURL_ENCODE_ALL) {
        burl_append_encode_all(b, str, len);
    }
    else if (flags & BURL_ENCODE_NDE) {
        burl_append_encode_nde(b, str, len);
    }
    else if (flags & BURL_ENCODE_PSNDE) {
        burl_append_encode_psnde(b, str, len);
    }
    else if (flags & BURL_ENCODE_B64U) {
        const unsigned char *s = (const unsigned char *)str;
        buffer_append_base64_encode_no_padding(b, s, len, BASE64_URL);
    }
    else if (flags & BURL_DECODE_B64U) {
        buffer_append_base64_decode(b, str, len, BASE64_URL);
    }

    /* note: not normalizing str, which could come from arbitrary header,
     * so it is possible that alpha chars are percent-encoded upper/lowercase */
    if (flags & BURL_TOLOWER) {
        burl_offset_tolower(b, off);
    }
    else if (flags & BURL_TOUPPER) {
        burl_offset_toupper(b, off);
    }
}
537