# UTF-8 decoder checks.
#
# Layout used throughout this file, as far as the cases themselves show:
#   !<name>       starts a named group of cases
#   ENCIN "..."   bytes handed to the decoder (\xNN escapes are raw bytes)
#   encout ...    indented; the code points expected back, in order
# Several ENCIN lines ahead of a single encout are used by the "Split write"
# group at the end to deliver one sequence in more than one piece.
# 0xfffd in an expectation is U+FFFD REPLACEMENT CHARACTER.

INIT
WANTENCODING

!Low
ENCIN "123"
  encout 0x31,0x32,0x33

# We want to prove the UTF-8 parser correctly handles all the sequences.
# Easy way to do this is to check it does low/high boundary cases, as that
# leaves only two for each sequence length
#
# These ranges are therefore:
#
# Two bytes:
# U+0080 = 000 10000000 =>    00010   000000
#                       => 11000010 10000000 = C2 80
# U+07FF = 111 11111111 =>    11111   111111
#                       => 11011111 10111111 = DF BF
#
# Three bytes:
# U+0800 = 00001000 00000000 =>     0000   100000   000000
#                            => 11100000 10100000 10000000 = E0 A0 80
# U+FFFD = 11111111 11111101 =>     1111   111111   111101
#                            => 11101111 10111111 10111101 = EF BF BD
# (We avoid U+FFFE and U+FFFF as they're noncharacters)
#
# Four bytes:
# U+10000  = 00001 00000000 00000000 =>      000   010000   000000   000000
#                                    => 11110000 10010000 10000000 10000000 = F0 90 80 80
# U+1FFFFF = 11111 11111111 11111111 =>      111   111111   111111   111111
#                                    => 11110111 10111111 10111111 10111111 = F7 BF BF BF
# NOTE: U+1FFFFF is above the Unicode maximum of U+10FFFF. It is used here
# because it is the largest value the four-byte bit layout can carry, and the
# expectation below records that the decoder hands it back unchanged.

!2 byte
ENCIN "\xC2\x80\xDF\xBF"
  encout 0x0080, 0x07FF

!3 byte
ENCIN "\xE0\xA0\x80\xEF\xBF\xBD"
  encout 0x0800,0xFFFD

!4 byte
ENCIN "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF"
  encout 0x10000,0x1fffff

# Next up, we check some invalid sequences
#  + Early termination (back to low bytes too soon)
#  + Early restart (another sequence introduction before the previous one was finished)
#
# In both groups each abandoned sequence is expected to yield exactly one
# U+FFFD, however many of its bytes had already arrived, and the byte that
# interrupted it is then decoded normally.

!Early termination
ENCIN "\xC2!"
  encout 0xfffd,0x21

ENCIN "\xE0!\xE0\xA0!"
  encout 0xfffd,0x21,0xfffd,0x21

ENCIN "\xF0!\xF0\x90!\xF0\x90\x80!"
  encout 0xfffd,0x21,0xfffd,0x21,0xfffd,0x21

!Early restart
ENCIN "\xC2\xC2\x90"
  encout 0xfffd,0x0090

ENCIN "\xE0\xC2\x90\xE0\xA0\xC2\x90"
  encout 0xfffd,0x0090,0xfffd,0x0090

ENCIN "\xF0\xC2\x90\xF0\x90\xC2\x90\xF0\x90\x80\xC2\x90"
  encout 0xfffd,0x0090,0xfffd,0x0090,0xfffd,0x0090

# Test the overlong sequences. For each sequence length we give an overlong
# encoding of U+0000, and an overlong encoding of the highest code point that
# is still small enough to fit in a shorter sequence.
#
# Two bytes:
# U+0000 = C0 80
# U+007F = 000 01111111 =>    00001   111111
#                       => 11000001 10111111 = C1 BF
#
# Three bytes:
# U+0000 = E0 80 80
# U+07FF = 00000111 11111111 =>     0000   011111   111111
#                            => 11100000 10011111 10111111 = E0 9F BF
#
# Four bytes:
# U+0000 = F0 80 80 80
# U+FFFF = 11111111 11111111 =>      000   001111   111111   111111
#                            => 11110000 10001111 10111111 10111111 = F0 8F BF BF
#
# Each overlong sequence is expected to collapse to a single U+FFFD.

!Overlong
ENCIN "\xC0\x80\xC1\xBF"
  encout 0xfffd,0xfffd

ENCIN "\xE0\x80\x80\xE0\x9F\xBF"
  encout 0xfffd,0xfffd

ENCIN "\xF0\x80\x80\x80\xF0\x8F\xBF\xBF"
  encout 0xfffd,0xfffd

# UTF-16 surrogates U+D800 and U+DFFF
# (ED A0 80 and ED BF BF: the first and last values of the surrogate range;
# each is expected to be rejected as one U+FFFD)
!UTF-16 Surrogates
ENCIN "\xED\xA0\x80\xED\xBF\xBF"
  encout 0xfffd,0xfffd

# A multi-byte sequence delivered in two pieces, broken at every possible
# interior position, must still decode to the single intended code point.
!Split write
ENCIN "\xC2"
ENCIN "\xA0"
  encout 0x000A0

ENCIN "\xE0"
ENCIN "\xA0\x80"
  encout 0x00800
ENCIN "\xE0\xA0"
ENCIN "\x80"
  encout 0x00800

ENCIN "\xF0"
ENCIN "\x90\x80\x80"
  encout 0x10000
ENCIN "\xF0\x90"
ENCIN "\x80\x80"
  encout 0x10000
ENCIN "\xF0\x90\x80"
ENCIN "\x80"
  encout 0x10000