1*e4f25e4aSBram MoolenaarINIT
2*e4f25e4aSBram MoolenaarWANTENCODING
3*e4f25e4aSBram Moolenaar
4*e4f25e4aSBram Moolenaar!Low
5*e4f25e4aSBram MoolenaarENCIN "123"
6*e4f25e4aSBram Moolenaar  encout 0x31,0x32,0x33
7*e4f25e4aSBram Moolenaar
8*e4f25e4aSBram Moolenaar# We want to prove the UTF-8 parser correctly handles all the sequences.
9*e4f25e4aSBram Moolenaar# An easy way to do this is to check the lowest and highest boundary cases,
10*e4f25e4aSBram Moolenaar# as that leaves only two tests for each sequence length.
11*e4f25e4aSBram Moolenaar#
12*e4f25e4aSBram Moolenaar# These ranges are therefore:
13*e4f25e4aSBram Moolenaar#
14*e4f25e4aSBram Moolenaar# Two bytes:
15*e4f25e4aSBram Moolenaar# U+0080 = 000 10000000 =>    00010   000000
16*e4f25e4aSBram Moolenaar#                       => 11000010 10000000 = C2 80
17*e4f25e4aSBram Moolenaar# U+07FF = 111 11111111 =>    11111   111111
18*e4f25e4aSBram Moolenaar#                       => 11011111 10111111 = DF BF
19*e4f25e4aSBram Moolenaar#
20*e4f25e4aSBram Moolenaar# Three bytes:
21*e4f25e4aSBram Moolenaar# U+0800 = 00001000 00000000 =>     0000   100000   000000
22*e4f25e4aSBram Moolenaar#                            => 11100000 10100000 10000000 = E0 A0 80
23*e4f25e4aSBram Moolenaar# U+FFFD = 11111111 11111101 =>     1111   111111   111101
24*e4f25e4aSBram Moolenaar#                            => 11101111 10111111 10111101 = EF BF BD
25*e4f25e4aSBram Moolenaar# (We avoid U+FFFE and U+FFFF as they're Unicode noncharacters)
26*e4f25e4aSBram Moolenaar#
27*e4f25e4aSBram Moolenaar# Four bytes:
28*e4f25e4aSBram Moolenaar# U+10000  = 00001 00000000 00000000 =>      000   010000   000000   000000
29*e4f25e4aSBram Moolenaar#                                    => 11110000 10010000 10000000 10000000 = F0 90 80 80
30*e4f25e4aSBram Moolenaar# U+1FFFFF = 11111 11111111 11111111 =>      111   111111   111111   111111
31*e4f25e4aSBram Moolenaar#                                    => 11110111 10111111 10111111 10111111 = F7 BF BF BF
32*e4f25e4aSBram Moolenaar
33*e4f25e4aSBram Moolenaar!2 byte
34*e4f25e4aSBram MoolenaarENCIN "\xC2\x80\xDF\xBF"
35*e4f25e4aSBram Moolenaar  encout 0x0080, 0x07FF
36*e4f25e4aSBram Moolenaar
37*e4f25e4aSBram Moolenaar!3 byte
38*e4f25e4aSBram MoolenaarENCIN "\xE0\xA0\x80\xEF\xBF\xBD"
39*e4f25e4aSBram Moolenaar  encout 0x0800,0xFFFD
40*e4f25e4aSBram Moolenaar
41*e4f25e4aSBram Moolenaar!4 byte
42*e4f25e4aSBram MoolenaarENCIN "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF"
43*e4f25e4aSBram Moolenaar  encout 0x10000,0x1fffff
44*e4f25e4aSBram Moolenaar
45*e4f25e4aSBram Moolenaar# Next up, we check some invalid sequences
46*e4f25e4aSBram Moolenaar#  + Early termination (back to low bytes too soon)
47*e4f25e4aSBram Moolenaar#  + Early restart (another sequence introduction before the previous one was finished)
48*e4f25e4aSBram Moolenaar
49*e4f25e4aSBram Moolenaar!Early termination
50*e4f25e4aSBram MoolenaarENCIN "\xC2!"
51*e4f25e4aSBram Moolenaar  encout 0xfffd,0x21
52*e4f25e4aSBram Moolenaar
53*e4f25e4aSBram MoolenaarENCIN "\xE0!\xE0\xA0!"
54*e4f25e4aSBram Moolenaar  encout 0xfffd,0x21,0xfffd,0x21
55*e4f25e4aSBram Moolenaar
56*e4f25e4aSBram MoolenaarENCIN "\xF0!\xF0\x90!\xF0\x90\x80!"
57*e4f25e4aSBram Moolenaar  encout 0xfffd,0x21,0xfffd,0x21,0xfffd,0x21
58*e4f25e4aSBram Moolenaar
59*e4f25e4aSBram Moolenaar!Early restart
60*e4f25e4aSBram MoolenaarENCIN "\xC2\xC2\x90"
61*e4f25e4aSBram Moolenaar  encout 0xfffd,0x0090
62*e4f25e4aSBram Moolenaar
63*e4f25e4aSBram MoolenaarENCIN "\xE0\xC2\x90\xE0\xA0\xC2\x90"
64*e4f25e4aSBram Moolenaar  encout 0xfffd,0x0090,0xfffd,0x0090
65*e4f25e4aSBram Moolenaar
66*e4f25e4aSBram MoolenaarENCIN "\xF0\xC2\x90\xF0\x90\xC2\x90\xF0\x90\x80\xC2\x90"
67*e4f25e4aSBram Moolenaar  encout 0xfffd,0x0090,0xfffd,0x0090,0xfffd,0x0090
68*e4f25e4aSBram Moolenaar
69*e4f25e4aSBram Moolenaar# Test the overlong sequences by giving, for each length, an overlong encoding
70*e4f25e4aSBram Moolenaar# of U+0000 and of the highest codepoint that still fits in a shorter sequence
71*e4f25e4aSBram Moolenaar#
72*e4f25e4aSBram Moolenaar# Two bytes:
73*e4f25e4aSBram Moolenaar# U+0000 = C0 80
74*e4f25e4aSBram Moolenaar# U+007F = 000 01111111 =>    00001   111111
75*e4f25e4aSBram Moolenaar#                       => 11000001 10111111 => C1 BF
76*e4f25e4aSBram Moolenaar#
77*e4f25e4aSBram Moolenaar# Three bytes:
78*e4f25e4aSBram Moolenaar# U+0000 = E0 80 80
79*e4f25e4aSBram Moolenaar# U+07FF = 00000111 11111111 =>     0000   011111   111111
80*e4f25e4aSBram Moolenaar#                            => 11100000 10011111 10111111 = E0 9F BF
81*e4f25e4aSBram Moolenaar#
82*e4f25e4aSBram Moolenaar# Four bytes:
83*e4f25e4aSBram Moolenaar# U+0000 = F0 80 80 80
84*e4f25e4aSBram Moolenaar# U+FFFF = 11111111 11111111 =>      000   001111   111111   111111
85*e4f25e4aSBram Moolenaar#                            => 11110000 10001111 10111111 10111111 = F0 8F BF BF
86*e4f25e4aSBram Moolenaar
87*e4f25e4aSBram Moolenaar!Overlong
88*e4f25e4aSBram MoolenaarENCIN "\xC0\x80\xC1\xBF"
89*e4f25e4aSBram Moolenaar  encout 0xfffd,0xfffd
90*e4f25e4aSBram Moolenaar
91*e4f25e4aSBram MoolenaarENCIN "\xE0\x80\x80\xE0\x9F\xBF"
92*e4f25e4aSBram Moolenaar  encout 0xfffd,0xfffd
93*e4f25e4aSBram Moolenaar
94*e4f25e4aSBram MoolenaarENCIN "\xF0\x80\x80\x80\xF0\x8F\xBF\xBF"
95*e4f25e4aSBram Moolenaar  encout 0xfffd,0xfffd
96*e4f25e4aSBram Moolenaar
97*e4f25e4aSBram Moolenaar# UTF-16 surrogates U+D800 and U+DFFF
98*e4f25e4aSBram Moolenaar!UTF-16 Surrogates
99*e4f25e4aSBram MoolenaarENCIN "\xED\xA0\x80\xED\xBF\xBF"
100*e4f25e4aSBram Moolenaar  encout 0xfffd,0xfffd
101*e4f25e4aSBram Moolenaar
# Each multi-byte sequence below is fed in as two consecutive ENCIN inputs,
# split at every possible byte position (after 1 byte for the 2-byte form,
# after 1 and 2 bytes for the 3-byte form, after 1, 2 and 3 bytes for the
# 4-byte form). The expected codepoint is listed after the second input.
102*e4f25e4aSBram Moolenaar!Split write
103*e4f25e4aSBram MoolenaarENCIN "\xC2"
104*e4f25e4aSBram MoolenaarENCIN "\xA0"
105*e4f25e4aSBram Moolenaar  encout 0x000A0
106*e4f25e4aSBram Moolenaar
107*e4f25e4aSBram MoolenaarENCIN "\xE0"
108*e4f25e4aSBram MoolenaarENCIN "\xA0\x80"
109*e4f25e4aSBram Moolenaar  encout 0x00800
110*e4f25e4aSBram MoolenaarENCIN "\xE0\xA0"
111*e4f25e4aSBram MoolenaarENCIN "\x80"
112*e4f25e4aSBram Moolenaar  encout 0x00800
113*e4f25e4aSBram Moolenaar
114*e4f25e4aSBram MoolenaarENCIN "\xF0"
115*e4f25e4aSBram MoolenaarENCIN "\x90\x80\x80"
116*e4f25e4aSBram Moolenaar  encout 0x10000
117*e4f25e4aSBram MoolenaarENCIN "\xF0\x90"
118*e4f25e4aSBram MoolenaarENCIN "\x80\x80"
119*e4f25e4aSBram Moolenaar  encout 0x10000
120*e4f25e4aSBram MoolenaarENCIN "\xF0\x90\x80"
121*e4f25e4aSBram MoolenaarENCIN "\x80"
122*e4f25e4aSBram Moolenaar  encout 0x10000
123