xref: /lighttpd1.4/src/burl.c (revision a01e62bb)
1 /*
2  * burl - buffer URL normalization
3  *
4  * Copyright(c) 2018 Glenn Strauss gstrauss()gluelogic.com  All rights reserved
5  * License: BSD 3-clause (same as lighttpd)
6  */
7 #include "first.h"
8 #include "burl.h"
9 
10 #include <string.h>
11 
12 #include "buffer.h"
13 #include "base64.h"
14 
/* uppercase hexadecimal digits, indexed by nibble value (0x0 - 0xF) */
static const char hex_chars_uc[] = "0123456789ABCDEF";

/* Lookup table indexed by byte value:
 *   0 => byte may appear literally in an HTTP URI
 *   1 => byte must be percent-encoded
 * Literal (0) bytes are exactly:
 *   ! $ & ' ( ) * + , - . / 0-9 : ; = ? @ A-Z _ a-z ~
 * Every other byte value (control chars, space, DEL, 0x80 - 0xFF, and the
 * punctuation noted per row below) is marked 1. */
static const char encoded_chars_http_uri_reqd[] = {
  /* 0 1 2 3  4 5 6 7  8 9 A B  C D E F */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  /* 0x00 - 0x0F  control chars */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  /* 0x10 - 0x1F  control chars */
  1,0,1,1, 0,1,0,0, 0,0,0,0, 0,0,0,0,  /* 0x20 - 0x2F  encode: space " # % */
  0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,1,0,  /* 0x30 - 0x3F  encode: < > */
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,  /* 0x40 - 0x4F */
  0,0,0,0, 0,0,0,0, 0,0,0,1, 1,1,1,0,  /* 0x50 - 0x5F  encode: [ \ ] ^ */
  1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,  /* 0x60 - 0x6F  encode: ` */
  0,0,0,0, 0,0,0,0, 0,0,0,1, 1,1,0,1,  /* 0x70 - 0x7F  encode: { | } DEL */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  /* 0x80 - 0x8F */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  /* 0x90 - 0x9F */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  /* 0xA0 - 0xAF */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  /* 0xB0 - 0xBF */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  /* 0xC0 - 0xCF */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  /* 0xD0 - 0xDF */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  /* 0xE0 - 0xEF */
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  /* 0xF0 - 0xFF */
};
39 
40 
/* li_cton(c, n): convert hex digit character c to its value, stored in n.
 * Evaluates to non-zero (and leaves 0..15 in n) if c is one of 0-9 A-F a-f;
 * evaluates to 0 otherwise, in which case n holds an unspecified value.
 * c (char) and n (nibble) MUST be unsigned integer types.
 * Note: c is evaluated more than once and n is assigned to, so pass
 * side-effect-free expressions. */
#define li_cton(c,n) \
  ( ((n) = (c) - '0') <= 9 \
    || ( ((n) = ((c) & 0xdf) - 'A') <= 5 && ((n) += 10) ) )

/* li_utf8_invalid_byte(b): non-zero if byte b can never appear in valid UTF-8
 * (0xC0, 0xC1, and 0xF5 - 0xFF).
 * b (byte) MUST be unsigned integer type
 * https://en.wikipedia.org/wiki/UTF-8
 * rejects overlong encodings of 7-bit ASCII and invalid UTF-8
 * (but does not detect other overlong multi-byte encodings) */
#define li_utf8_invalid_byte(b) \
  ( (b) >= 0xF5 || ((b) | 0x1) == 0xC1 )
50 
51 
/* Returns 1 if c is an "unreserved" URL character, i.e. one for which
 * light_isalnum() (declared elsewhere) is true, or one of  - . _ ~
 * Returns 0 otherwise. */
static int burl_is_unreserved (const int c)
{
    if (light_isalnum(c))
        return 1;

    switch (c) {
      case '-':
      case '.':
      case '_':
      case '~':
        return 1;
      default:
        return 0;
    }
}
56 
57 
/*
 * Rewriting path of burl_normalize_basic_unreserved(): called once the
 * in-place scan reaches offset i in b where the URL text must change length.
 *
 * Builds the normalized URL in scratch buffer t (the first i bytes of b are
 * copied verbatim, the remainder is transformed), then copies the result
 * back into b:
 *   - bytes allowed literally are copied as-is
 *   - "%XX" decoding to an unreserved char is replaced by that char
 *   - any other "%XX" is kept, with uppercase hex digits
 *   - '#' ends processing (fragment is dropped)
 *   - anything else is percent-encoded
 *
 * qs is the caller's running state and is returned updated:
 *   -1  no '?' seen yet (replaced by output offset of first literal '?')
 *   -2  a byte which is never valid in UTF-8 was seen (sticky)
 *  >=0  output offset of first literal '?'
 */
static int burl_normalize_basic_unreserved_fix (buffer *b, buffer *t, int i, int qs)
{
    const int len = (int)buffer_clen(b);
    const unsigned char * const src = (unsigned char *)b->ptr;
    /* size requested covers the worst case: each remaining input byte
     * expanding to three output bytes ("%XX"), plus one */
    unsigned char * const dst =
      (unsigned char *)buffer_string_prepare_copy(t, i + (len - i) * 3 + 1);
    unsigned int hi, lo;
    int o = i; /* write offset into dst */

    memcpy(dst, src, (size_t)i);

    while (i < len) {
        const unsigned int c = src[i];

        if (!encoded_chars_http_uri_reqd[c]) {
            /* record only the first '?', and never overwrite -2 */
            if (__builtin_expect( (c == '?'), 0) && -1 == qs) qs = o;
            dst[o++] = (unsigned char)c;
            ++i;
        }
        else if (c == '%' && li_cton(src[i+1], hi) && li_cton(src[i+2], lo)) {
            const unsigned int x = (hi << 4) | lo;
            if (burl_is_unreserved(x)) {
                dst[o++] = x;
            }
            else {
                dst[o++] = '%';
                dst[o++] = hex_chars_uc[hi];
                dst[o++] = hex_chars_uc[lo];
                if (li_utf8_invalid_byte(x)) qs = -2;
            }
            i += 3;
        }
        else if (c == '#') {
            break; /* ignore fragment */
        }
        else {
            dst[o++] = '%';
            dst[o++] = hex_chars_uc[(c >> 4) & 0xF];
            dst[o++] = hex_chars_uc[c & 0xF];
            if (li_utf8_invalid_byte(c)) qs = -2;
            ++i;
        }
    }

    buffer_copy_string_len(b, (char *)dst, (size_t)o);
    return qs;
}
96 
97 
/*
 * Basic URL normalization ("unreserved" policy): percent-encodings of
 * unreserved characters are decoded, all other percent-encodings are kept
 * (hex digits uppercased), bytes that may not appear literally are
 * percent-encoded, and any '#' fragment is removed.
 *
 * Scans b in place for as long as the only change needed is uppercasing
 * hex digits.  At the first position that requires the text to change
 * length, hands the rest of the job to
 * burl_normalize_basic_unreserved_fix(), which uses t as scratch space.
 *
 * Returns offset of the first literal '?' in b, -1 if there is none,
 * or -2 if a byte which is never valid in UTF-8 was encountered.
 */
static int burl_normalize_basic_unreserved (buffer *b, buffer *t)
{
    const unsigned char * const src = (unsigned char *)b->ptr;
    const int len = (int)buffer_clen(b);
    unsigned int hi, lo;
    int qs = -1;

    for (int pos = 0; pos < len; ++pos) {
        const unsigned int c = src[pos];

        if (!encoded_chars_http_uri_reqd[c]) {
            /* literal char; note first '?' unless qs already holds -2 */
            if (__builtin_expect( (c == '?'), 0) && -1 == qs) qs = pos;
            continue;
        }

        if (c == '%' && li_cton(src[pos+1], hi) && li_cton(src[pos+2], lo)) {
            const unsigned int x = (hi << 4) | lo;
            if (!burl_is_unreserved(x)) {
                /* encoding stays; only the hex digit case may change */
                if (li_utf8_invalid_byte(x)) qs = -2;
                if (src[pos+1] >= 'a') b->ptr[pos+1] &= 0xdf; /* uppercase hex */
                if (src[pos+2] >= 'a') b->ptr[pos+2] &= 0xdf; /* uppercase hex */
                pos += 2;
                continue;
            }
            /* else: encoded unreserved char must be decoded (below) */
        }
        else if (c == '#') { /* ignore fragment */
            buffer_truncate(b, (size_t)pos);
            break;
        }

        /* length of URL must change from here on; switch to copying */
        qs = burl_normalize_basic_unreserved_fix(b, t, pos, qs);
        break;
    }

    return qs;
}
128 
129 
/*
 * Rewriting path of burl_normalize_basic_required(): called once the
 * in-place scan reaches offset i in b where the URL text must change length.
 *
 * Builds the normalized URL in scratch buffer t (the first i bytes of b are
 * copied verbatim, the remainder is transformed), then copies the result
 * back into b:
 *   - bytes allowed literally are copied as-is
 *   - "%XX" is decoded when the decoded byte may appear literally and is
 *     not a delimiter in the current component ('/' and '?' in the path;
 *     '&' '=' ';' '+' in the query); otherwise it is kept, with uppercase
 *     hex digits
 *   - '#' ends processing (fragment is dropped)
 *   - anything else is percent-encoded
 *
 * qs on entry is the offset of the query '?' found so far (or < 0 if none).
 *
 * Returns output offset of the FIRST literal '?' (start of the query per
 * RFC 3986 section 3.4), -1 if there is none, or -2 if a byte which is
 * never valid in UTF-8 was encountered.
 */
static int burl_normalize_basic_required_fix (buffer *b, buffer *t, int i, int qs)
{
    int j = i;
    const int used = (int)buffer_clen(b);
    const unsigned char * const s = (unsigned char *)b->ptr;
    /* size requested covers the worst case: each remaining input byte
     * expanding to three output bytes ("%XX"), plus one */
    unsigned char * const p =
      (unsigned char *)buffer_string_prepare_copy(t,i+(used-i)*3+1);
    unsigned int n1, n2;
    int invalid_utf8 = 0;
    memcpy(p, s, (size_t)i);
    for (; i < used; ++i, ++j) {
        if (!encoded_chars_http_uri_reqd[s[i]]) {
            p[j] = s[i];
            /* only the first '?' starts the query; any later '?' is query
             * data and must not move the path/query boundary */
            if (__builtin_expect( (s[i] == '?'), 0) && qs < 0) qs = j;
        }
        else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) {
            const unsigned int x = (n1 << 4) | n2;
            if (!encoded_chars_http_uri_reqd[x]
                && (qs < 0
                    ? (x != '/' && x != '?')
                    : (x != '&' && x != '=' && x != ';' && x != '+'))) {
                p[j] = x;
            }
            else {
                p[j]   = '%';
                p[++j] = hex_chars_uc[n1]; /*(s[i+1] & 0xdf)*/
                p[++j] = hex_chars_uc[n2]; /*(s[i+2] & 0xdf)*/
                invalid_utf8 |= li_utf8_invalid_byte(x);
            }
            i+=2;
        }
        else if (s[i] == '#') break; /* ignore fragment */
        else {
            p[j]   = '%';
            p[++j] = hex_chars_uc[(s[i] >> 4) & 0xF];
            p[++j] = hex_chars_uc[s[i] & 0xF];
            invalid_utf8 |= li_utf8_invalid_byte(s[i]);
        }
    }
    buffer_copy_string_len(b, (char *)p, (size_t)j);
    return !invalid_utf8 ? qs : -2;
}
172 
173 
/*
 * Basic URL normalization ("required" policy): percent-encodings are decoded
 * unless the decoded byte must be encoded or is a delimiter in the current
 * component ('/' and '?' in the path; '&' '=' ';' '+' in the query); kept
 * percent-encodings get uppercase hex digits; bytes that may not appear
 * literally are percent-encoded; any '#' fragment is removed.
 *
 * Scans b in place for as long as the only change needed is uppercasing
 * hex digits.  At the first position that requires the text to change
 * length, hands the rest of the job to
 * burl_normalize_basic_required_fix(), which uses t as scratch space.
 *
 * Returns offset of the FIRST literal '?' in b (start of the query per
 * RFC 3986 section 3.4), -1 if there is none, or -2 if a byte which is
 * never valid in UTF-8 was encountered.
 */
static int burl_normalize_basic_required (buffer *b, buffer *t)
{
    const unsigned char * const s = (unsigned char *)b->ptr;
    const int used = (int)buffer_clen(b);
    unsigned int n1, n2, x;
    int qs = -1;
    int invalid_utf8 = 0;

    for (int i = 0; i < used; ++i) {
        if (!encoded_chars_http_uri_reqd[s[i]]) {
            /* only the first '?' starts the query; any later '?' is query
             * data and must not move the path/query boundary */
            if (s[i] == '?' && qs < 0) qs = i;
        }
        else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)
                 && (encoded_chars_http_uri_reqd[(x = (n1 << 4) | n2)]
                     || (qs < 0
                         ? (x == '/' || x == '?')
                         : (x == '&' || x == '=' || x == ';' || x == '+')))) {
            /* encoding stays; only the hex digit case may change */
            invalid_utf8 |= li_utf8_invalid_byte(x);
            if (s[i+1] >= 'a') b->ptr[i+1] &= 0xdf; /* uppercase hex */
            if (s[i+2] >= 'a') b->ptr[i+2] &= 0xdf; /* uppercase hex */
            i+=2;
        }
        else if (s[i] == '#') { /* ignore fragment */
            buffer_truncate(b, (size_t)i);
            break;
        }
        else {
            /* length of URL must change from here on; switch to copying */
            qs = burl_normalize_basic_required_fix(b, t, i, qs);
            break;
        }
    }

    return !invalid_utf8 ? qs : -2;
}
208 
209 
burl_contains_ctrls(const buffer * b)210 static int burl_contains_ctrls (const buffer *b)
211 {
212     const char * const s = b->ptr;
213     const int used = (int)buffer_clen(b);
214     for (int i = 0; i < used; ++i) {
215         if (s[i] == '%' && (s[i+1] < '2' || (s[i+1] == '7' && s[i+2] == 'F')))
216             return 1;
217     }
218     return 0;
219 }
220 
221 
/* Starting at offset i, replace every "%20" in b with '+', compacting the
 * string in place, then truncate b to its new (shorter) length. */
static void burl_normalize_qs20_to_plus_fix (buffer *b, int i)
{
    char * const s = b->ptr;
    const int len = (int)buffer_clen(b);
    int rd = i; /* read offset  */
    int wr = i; /* write offset; never ahead of rd */

    while (rd < len) {
        if (s[rd] == '%' && s[rd+1] == '2' && s[rd+2] == '0') {
            s[wr++] = '+';
            rd += 3;
        }
        else {
            s[wr++] = s[rd++];
        }
    }

    buffer_truncate(b, wr);
}
236 
237 
/* Replace "%20" with '+' in the part of b that follows offset qs.
 * Does nothing if qs is negative.  The buffer is rewritten (by
 * burl_normalize_qs20_to_plus_fix()) only if a "%20" is actually found. */
static void burl_normalize_qs20_to_plus (buffer *b, int qs)
{
    if (qs < 0) return;

    const char * const s = b->ptr;
    const int len = (int)buffer_clen(b);
    int pos = qs + 1;

    /* find first "%20" after the '?' */
    while (pos < len
           && !(s[pos] == '%' && s[pos+1] == '2' && s[pos+2] == '0'))
        ++pos;

    if (pos != len)
        burl_normalize_qs20_to_plus_fix(b, pos);
}
249 
250 
/* Starting at offset i, replace every "%2F" with '/' in the part of b that
 * precedes offset qs (or in all of b if qs is negative), compacting in place.
 * When qs >= 0, the remainder of b from qs onward is moved down to follow
 * the shortened leading part.  b is truncated to its new length.
 * Returns the new offset of what was at qs, or qs unchanged if negative. */
static int burl_normalize_2F_to_slash_fix (buffer *b, int qs, int i)
{
    char * const s = b->ptr;
    const int total = (int)buffer_clen(b);
    const int end = (qs < 0) ? total : qs;
    int rd = i; /* read offset  */
    int wr = i; /* write offset; never ahead of rd */

    while (rd < end) {
        if (s[rd] == '%' && s[rd+1] == '2' && s[rd+2] == 'F') {
            s[wr++] = '/';
            rd += 3;
        }
        else {
            s[wr++] = s[rd++];
        }
    }

    if (qs >= 0) {
        /* regions may overlap, hence memmove */
        const int taillen = total - qs;
        memmove(s + wr, s + qs, (size_t)taillen);
        qs = wr;
        wr += taillen;
    }

    buffer_truncate(b, wr);
    return qs;
}
273 
274 
/* Look for "%2F" in the part of b preceding offset qs (all of b if qs < 0).
 * If none is found, returns qs unchanged.  If one is found: with
 * HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE set in flags, every "%2F" in
 * that part is decoded to '/' and the adjusted qs is returned; otherwise
 * returns -2.
 * ("%2F" must already have been uppercased during normalization) */
static int burl_normalize_2F_to_slash (buffer *b, int qs, int flags)
{
    const char * const s = b->ptr;
    const int end = (qs < 0) ? (int)buffer_clen(b) : qs;

    for (int pos = 0; pos < end; ++pos) {
        if (s[pos] != '%' || s[pos+1] != '2' || s[pos+2] != 'F')
            continue;

        if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE)
            return burl_normalize_2F_to_slash_fix(b, qs, pos);

        return -2; /*(flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)*/
    }

    return qs;
}
289 
290 
/*
 * Detect "." / ".." path segments and "//" in the part of b preceding
 * offset qs (all of b if qs < 0).
 *
 * If none are present, b is left untouched and qs is returned.
 * If present and HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT is set in
 * flags, returns -2.  Otherwise the leading part is passed through
 * buffer_path_simplify() (the part from qs onward is parked in t meanwhile
 * and re-appended afterwards), and the new offset of that trailing part is
 * returned (or qs unchanged if negative).
 */
static int burl_normalize_path (buffer *b, buffer *t, int qs, int flags)
{
    const unsigned char * const s = (unsigned char *)b->ptr;
    const int blen = (int)buffer_clen(b);
    const int plen = (qs < 0) ? blen : qs;
    int needs_simplify = 0;

    for (int i = 0; i < plen; ++i) {
        /* at this point i is at offset 0 or just past a '/' */
        if (s[i] == '.') {
            if (s[i+1] == '.') ++i; /* ".." : look past second dot */
            const unsigned char after = s[i+1];
            if (after == '/' || after == '?' || after == '\0') {
                needs_simplify = 1;
                break;
            }
        }

        /* advance to next '/' (or end of path part) */
        while (i < plen && s[i] != '/') ++i;

        if (s[i] == '/' && s[i+1] == '/') { /*(s[plen] != '/')*/
            needs_simplify = 1;
            break;
        }
    }

    if (!needs_simplify)
        return qs;

    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT)
        return -2;

    if (qs < 0) {
        buffer_path_simplify(b);
        return qs;
    }

    /* set trailing part aside, simplify leading part, then rejoin */
    buffer_copy_string_len(t, b->ptr+qs, blen - qs);
    buffer_truncate(b, qs);
    buffer_path_simplify(b);
    qs = (int)buffer_clen(b);
    buffer_append_string_len(b, BUF_PTR_LEN(t));

    return qs;
}
326 
327 
328 __attribute_cold__
329 __attribute_noinline__
330 __attribute_pure__
burl_scan_qmark(const buffer * const b)331 static int burl_scan_qmark (const buffer * const b) {
332     const char * const qmark = strchr(b->ptr, '?');
333     return qmark ? (int)(qmark - b->ptr) : -1;
334 }
335 
336 
/*
 * Normalize the URL in b in place; t is scratch space.  Steps, selected by
 * bits in flags (HTTP_PARSEOPT_URL_NORMALIZE_*), run in this order:
 *   1. (Windows/Cygwin builds only) PATH_BACKSLASH_TRANS: '\\' -> '/' in
 *      the part before any '?'
 *   2. basic percent-encoding normalization: REQUIRED selects
 *      burl_normalize_basic_required(), else
 *      burl_normalize_basic_unreserved()
 *   3. INVALID_UTF8_REJECT: fail if step 2 reported invalid UTF-8
 *   4. CTRLS_REJECT: fail if burl_contains_ctrls() matches
 *   5. PATH_2F_DECODE / PATH_2F_REJECT: handle "%2F" before the query
 *   6. PATH_DOTSEG_REMOVE / PATH_DOTSEG_REJECT: handle "." ".." "//"
 *   7. QUERY_20_PLUS: "%20" -> '+' in the query
 *
 * Returns -2 if the URL is rejected; otherwise the offset of the query '?'
 * in the normalized b as tracked by the helpers above, or -1 if none.
 */
int burl_normalize (buffer *b, buffer *t, int flags)
{
  #if defined(__WIN32) || defined(__CYGWIN__)
    /* Windows and Cygwin treat '\\' as '/' if '\\' is present in path;
     * convert to '/' for consistency before percent-encoding
     * normalization which will convert '\\' to "%5C" in the URL.
     * (Clients still should not be sending '\\' unencoded in requests.) */
    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_BACKSLASH_TRANS) {
        char *p = b->ptr;
        while (*p != '?' && *p != '\0') {
            if (*p == '\\') *p = '/';
            ++p;
        }
    }
  #endif

    int qs;
    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_REQUIRED)
        qs = burl_normalize_basic_required(b, t);
    else
        qs = burl_normalize_basic_unreserved(b, t);

    if (-2 == qs) {
        /* invalid UTF-8 was seen: reject, or tolerate it and recover
         * the '?' offset which the -2 marker displaced */
        if (flags & HTTP_PARSEOPT_URL_NORMALIZE_INVALID_UTF8_REJECT)
            return -2;
        qs = burl_scan_qmark(b);
    }

    if ((flags & HTTP_PARSEOPT_URL_NORMALIZE_CTRLS_REJECT)
        && burl_contains_ctrls(b))
        return -2;

    if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE
                |HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)) {
        qs = burl_normalize_2F_to_slash(b, qs, flags);
        if (-2 == qs) return -2;
    }

    if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REMOVE
                |HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT)) {
        qs = burl_normalize_path(b, t, qs, flags);
        if (-2 == qs) return -2;
    }

    if ((flags & HTTP_PARSEOPT_URL_NORMALIZE_QUERY_20_PLUS) && qs >= 0)
        burl_normalize_qs20_to_plus(b, qs);

    return qs;
}
383 
384 
/*
 * Append str[0..len) to b, percent-encoding everything except unreserved
 * chars  - . 0-9 A-Z _ a-z ~
 * Existing "%XX" escapes are not double-encoded: an escape of an unreserved
 * char is decoded, any other escape is copied through with its hex digits
 * unchanged.
 *
 * Only escapes lying entirely within the first len bytes are recognized.
 * A '%' in the last two bytes of the input is encoded as "%25", so bytes at
 * or beyond str[len] are never read or consumed (str may be a slice of a
 * longer string and need not be NUL-terminated at len).
 *
 * Note: not checking for invalid UTF-8
 */
static void burl_append_encode_nde (buffer * const b, const char * const str, const size_t len)
{
    /* worst case: every input byte becomes "%XX" */
    char * const p = buffer_string_prepare_append(b, len*3);
    unsigned int n1, n2;
    size_t j = 0;
    for (size_t i = 0; i < len; ++i, ++j) {
        /* unsigned so that nibble extraction is well-defined for >= 0x80 */
        const unsigned char c = (unsigned char)str[i];
        if (c == '%' && len - i > 2
            && li_cton((unsigned char)str[i+1], n1)
            && li_cton((unsigned char)str[i+2], n2)) {
            const unsigned int x = (n1 << 4) | n2;
            if (burl_is_unreserved((int)x)) {
                p[j] = (char)x;
            }
            else { /* leave UTF-8, control chars, and required chars encoded */
                p[j]   = '%';
                p[++j] = str[i+1];
                p[++j] = str[i+2];
            }
            i+=2;
        }
        else if (burl_is_unreserved(c)) {
            p[j] = (char)c;
        }
        else {
            p[j]   = '%';
            p[++j] = hex_chars_uc[(c >> 4) & 0xF];
            p[++j] = hex_chars_uc[c & 0xF];
        }
    }
    buffer_commit(b, j);
}
417 
418 
/*
 * Append str[0..len) to b, percent-encoding everything except '/' and the
 * unreserved chars  - . 0-9 A-Z _ a-z ~
 * Existing "%XX" escapes are not double-encoded: an escape of an unreserved
 * char is decoded, any other escape is copied through with its hex digits
 * unchanged.
 *
 * Only escapes lying entirely within the first len bytes are recognized.
 * A '%' in the last two bytes of the input is encoded as "%25", so bytes at
 * or beyond str[len] are never read or consumed (str may be a slice of a
 * longer string and need not be NUL-terminated at len).
 *
 * Note: not checking for invalid UTF-8
 */
static void burl_append_encode_psnde (buffer * const b, const char * const str, const size_t len)
{
    /* worst case: every input byte becomes "%XX" */
    char * const p = buffer_string_prepare_append(b, len*3);
    unsigned int n1, n2;
    size_t j = 0;
    for (size_t i = 0; i < len; ++i, ++j) {
        /* unsigned so that nibble extraction is well-defined for >= 0x80 */
        const unsigned char c = (unsigned char)str[i];
        if (c == '%' && len - i > 2
            && li_cton((unsigned char)str[i+1], n1)
            && li_cton((unsigned char)str[i+2], n2)) {
            const unsigned int x = (n1 << 4) | n2;
            if (burl_is_unreserved((int)x)) {
                p[j] = (char)x;
            }
            else { /* leave UTF-8, control chars, and required chars encoded */
                p[j]   = '%';
                p[++j] = str[i+1];
                p[++j] = str[i+2];
            }
            i+=2;
        }
        else if (burl_is_unreserved(c) || c == '/') {
            p[j] = (char)c;
        }
        else {
            p[j]   = '%';
            p[++j] = hex_chars_uc[(c >> 4) & 0xF];
            p[++j] = hex_chars_uc[c & 0xF];
        }
    }
    buffer_commit(b, j);
}
451 
452 
/* Append str[0..len) to b, percent-encoding every byte except the
 * unreserved chars  - . 0-9 A-Z _ a-z ~
 * Note: an existing '%' is itself encoded, i.e. prior percent-encodings in
 * str end up double-encoded.
 * Note: not checking for invalid UTF-8 */
static void burl_append_encode_all (buffer * const b, const char * const str, const size_t len)
{
    /* worst case: every input byte becomes "%XX" */
    char * const out = buffer_string_prepare_append(b, len*3);
    int n = 0; /* number of bytes written to out */

    for (unsigned int k = 0; k < len; ++k) {
        if (burl_is_unreserved(str[k])) {
            out[n++] = str[k];
            continue;
        }
        out[n++] = '%';
        out[n++] = hex_chars_uc[(str[k] >> 4) & 0xF];
        out[n++] = hex_chars_uc[str[k] & 0xF];
    }

    buffer_commit(b, n);
}
472 
473 
/* Lowercase, in place, the NUL-terminated contents of b from offset off on.
 * A '%' followed by two hex digits is stepped over as a unit and left
 * untouched (so percent-encodings, including encodings of alpha chars,
 * keep their case). */
static void burl_offset_tolower (buffer * const b, const size_t off)
{
    char *p = b->ptr + off;
    while (*p != '\0') {
        if (light_isupper(*p)) {
            *p |= 0x20;
            ++p;
        }
        else if (*p == '%' && light_isxdigit(p[1]) && light_isxdigit(p[2])) {
            p += 3;
        }
        else {
            ++p;
        }
    }
}
483 
484 
/* Uppercase, in place, the NUL-terminated contents of b from offset off on.
 * A '%' followed by two hex digits is stepped over as a unit and left
 * untouched (so percent-encodings, including encodings of alpha chars,
 * keep their case). */
static void burl_offset_toupper (buffer * const b, const size_t off)
{
    char *p = b->ptr + off;
    while (*p != '\0') {
        if (light_islower(*p)) {
            *p &= 0xdf;
            ++p;
        }
        else if (*p == '%' && light_isxdigit(p[1]) && light_isxdigit(p[2])) {
            p += 3;
        }
        else {
            ++p;
        }
    }
}
494 
495 
/*
 * Append str[0..len) to b, transformed as selected by flags (BURL_*).
 *
 * Nothing is appended if len is 0.  With flags == 0 the bytes are appended
 * verbatim.  Otherwise the first matching flag in this order picks the
 * transformation:
 *   BURL_ENCODE_NONE, BURL_ENCODE_ALL, BURL_ENCODE_NDE, BURL_ENCODE_PSNDE,
 *   BURL_ENCODE_B64U, BURL_DECODE_B64U
 * (if none of these is set, nothing is appended).
 * Afterwards, if BURL_TOLOWER or BURL_TOUPPER is set, the newly appended
 * part of b is case-converted; BURL_TOLOWER wins if both are set.
 */
void burl_append (buffer * const b, const char * const str, const size_t len, const int flags)
{
    if (0 == len) return;

    if (0 == flags) {
        buffer_append_string_len(b, str, len);
        return;
    }

    /* remember where the appended data begins, for case conversion below */
    const int recase = flags & (BURL_TOUPPER|BURL_TOLOWER);
    const size_t off = recase ? buffer_clen(b) : 0;

    if (flags & BURL_ENCODE_NONE)
        buffer_append_string_len(b, str, len);
    else if (flags & BURL_ENCODE_ALL)
        burl_append_encode_all(b, str, len);
    else if (flags & BURL_ENCODE_NDE)
        burl_append_encode_nde(b, str, len);
    else if (flags & BURL_ENCODE_PSNDE)
        burl_append_encode_psnde(b, str, len);
    else if (flags & BURL_ENCODE_B64U) {
        const unsigned char *s = (const unsigned char *)str;
        buffer_append_base64_encode_no_padding(b, s, len, BASE64_URL);
    }
    else if (flags & BURL_DECODE_B64U)
        buffer_append_base64_decode(b, str, len, BASE64_URL);

    /* note: not normalizing str, which could come from arbitrary header,
     * so it is possible that alpha chars are percent-encoded upper/lowercase */
    if (recase) {
        if (flags & BURL_TOLOWER)
            burl_offset_tolower(b, off);
        else /*(flags & BURL_TOUPPER)*/
            burl_offset_toupper(b, off);
    }
}
537