# UTF-8 decoder test script.
#
# Each test feeds raw bytes with ENCIN and lists the codepoints expected back
# with encout. Every ill-formed sequence below is expected to decode to a
# single U+FFFD (REPLACEMENT CHARACTER).
#
# NOTE(review): INIT and WANTENCODING are harness directives; presumably they
# set up the test state and switch the harness into encoding-test mode --
# confirm against the harness.
INIT
WANTENCODING

# Plain ASCII bytes come back unchanged, one codepoint per byte.
!Low
ENCIN "123"
  encout 0x31,0x32,0x33

# We want to prove the UTF-8 parser correctly handles all the sequences.
# An easy way to do this is to check the lowest and highest value of each
# sequence length, as that leaves only two cases per length.
#
# These ranges are therefore:
#
# Two bytes:
#   U+0080 = 000 10000000 =>    00010   000000
#                         => 11000010 10000000 = C2 80
#   U+07FF = 111 11111111 =>    11111   111111
#                         => 11011111 10111111 = DF BF
#
# Three bytes:
#   U+0800 = 00001000 00000000 =>     0000   100000   000000
#                              => 11100000 10100000 10000000 = E0 A0 80
#   U+FFFD = 11111111 11111101 =>     1111   111111   111101
#                              => 11101111 10111111 10111101 = EF BF BD
#   (We stop at U+FFFD because U+FFFE and U+FFFF are noncharacters.)
#
# Four bytes:
#   U+10000  = 00001 00000000 00000000 =>      000   010000   000000   000000
#                                      => 11110000 10010000 10000000 10000000 = F0 90 80 80
#   U+1FFFFF = 11111 11111111 11111111 =>      111   111111   111111   111111
#                                      => 11110111 10111111 10111111 10111111 = F7 BF BF BF
#   (U+1FFFFF is the largest value 21 payload bits can carry. It lies above
#   the Unicode limit U+10FFFF, but this test expects it to decode as-is.)

!2 byte
ENCIN "\xC2\x80\xDF\xBF"
  encout 0x0080, 0x07FF

!3 byte
ENCIN "\xE0\xA0\x80\xEF\xBF\xBD"
  encout 0x0800,0xFFFD

!4 byte
ENCIN "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF"
  encout 0x10000,0x1fffff

# Next up, we check some invalid sequences:
#  + Early termination (back to low bytes too soon)
#  + Early restart (another sequence introduction before the previous one
#    was finished)
#
# In the early-termination cases a sequence is cut short by an ASCII '!'
# after 1, 2 or 3 of its bytes. However many bytes were consumed, the
# truncated sequence yields exactly one U+FFFD, and the '!' that interrupted
# it still decodes normally as 0x21.

!Early termination
ENCIN "\xC2!"
  encout 0xfffd,0x21

ENCIN "\xE0!\xE0\xA0!"
  encout 0xfffd,0x21,0xfffd,0x21

ENCIN "\xF0!\xF0\x90!\xF0\x90\x80!"
  encout 0xfffd,0x21,0xfffd,0x21,0xfffd,0x21

# In the early-restart cases an unfinished sequence is interrupted by a new
# lead byte (C2). The abandoned sequence yields one U+FFFD, and the
# interrupting C2 90 pair then decodes normally as U+0090.

!Early restart
ENCIN "\xC2\xC2\x90"
  encout 0xfffd,0x0090

ENCIN "\xE0\xC2\x90\xE0\xA0\xC2\x90"
  encout 0xfffd,0x0090,0xfffd,0x0090

ENCIN "\xF0\xC2\x90\xF0\x90\xC2\x90\xF0\x90\x80\xC2\x90"
  encout 0xfffd,0x0090,0xfffd,0x0090,0xfffd,0x0090

# Test overlong sequences. For each sequence length we give an overlong
# encoding of U+0000, and an overlong encoding of the highest codepoint that
# would still fit in a shorter sequence. Each overlong sequence is expected
# to decode to a single U+FFFD.
#
# Two bytes:
#   U+0000 = C0 80
#   U+007F = 000 01111111 =>    00001   111111
#                         => 11000001 10111111 = C1 BF
#
# Three bytes:
#   U+0000 = E0 80 80
#   U+07FF = 00000111 11111111 =>     0000   011111   111111
#                              => 11100000 10011111 10111111 = E0 9F BF
#
# Four bytes:
#   U+0000 = F0 80 80 80
#   U+FFFF = 11111111 11111111 =>      000   001111   111111   111111
#                              => 11110000 10001111 10111111 10111111 = F0 8F BF BF

!Overlong
ENCIN "\xC0\x80\xC1\xBF"
  encout 0xfffd,0xfffd

ENCIN "\xE0\x80\x80\xE0\x9F\xBF"
  encout 0xfffd,0xfffd

ENCIN "\xF0\x80\x80\x80\xF0\x8F\xBF\xBF"
  encout 0xfffd,0xfffd

# UTF-16 surrogates: ED A0 80 would be U+D800 and ED BF BF would be U+DFFF,
# the two ends of the surrogate range. Each is expected to decode to a single
# U+FFFD.
!UTF-16 Surrogates
ENCIN "\xED\xA0\x80\xED\xBF\xBF"
  encout 0xfffd,0xfffd

# A multi-byte sequence is split across two ENCIN calls at every possible
# position: after byte 1 of 2; after bytes 1 and 2 of 3; after bytes 1, 2
# and 3 of 4. The codepoint is expected only once the final byte has
# arrived.
# NOTE(review): the first ENCIN of each pair has no encout line; presumably
# the harness treats that as "no output expected" -- confirm against the
# harness.
!Split write
ENCIN "\xC2"
ENCIN "\xA0"
  encout 0x000A0

ENCIN "\xE0"
ENCIN "\xA0\x80"
  encout 0x00800
ENCIN "\xE0\xA0"
ENCIN "\x80"
  encout 0x00800

ENCIN "\xF0"
ENCIN "\x90\x80\x80"
  encout 0x10000
ENCIN "\xF0\x90"
ENCIN "\x80\x80"
  encout 0x10000
ENCIN "\xF0\x90\x80"
ENCIN "\x80"
  encout 0x10000