1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/// Marker bits OR-ed into every continuation byte (`10xx_xxxx`).
const TAG_CONT: u8 = 0b1000_0000;
/// Marker bits OR-ed into the lead byte of a two-byte sequence (`110x_xxxx`).
const TAG_TWO_B: u8 = 0b1100_0000;
/// Marker bits OR-ed into the lead byte of a three-byte sequence (`1110_xxxx`).
const TAG_THREE_B: u8 = 0b1110_0000;
/// Marker bits OR-ed into the lead byte of a four-byte sequence (`1111_0xxx`).
const TAG_FOUR_B: u8 = 0b1111_0000;
/// Exclusive upper bound on code points that are encoded in one byte.
const MAX_ONE_B: u32 = 0x80;
/// Exclusive upper bound on code points that are encoded in two bytes.
const MAX_TWO_B: u32 = 0x800;
/// Exclusive upper bound on code points that are encoded in three bytes;
/// code points at or above this value take four bytes.
const MAX_THREE_B: u32 = 0x10000;
/// Returns the total length in bytes of the UTF-8 sequence introduced by the
/// lead byte `byte`.
///
/// The classification looks at the lead byte only:
///
/// * `0x00..=0x7F` gives 1
/// * `0xC0..=0xDF` gives 2
/// * `0xE0..=0xEF` gives 3
/// * `0xF0..=0xFF` gives 4
///
/// No further validation is performed: every byte in those ranges is
/// classified purely by its range, including values such as `0xF8..=0xFF`
/// that well-formed UTF-8 never uses as a lead byte.
///
/// # Panics
///
/// Panics with `"Invalid byte sequences"` when `byte` is in `0x80..=0xBF`,
/// i.e. when it is a continuation byte and therefore cannot start a sequence.
#[inline]
pub fn utf8_width(byte: u8) -> u8 {
    match byte {
        0x00..=0x7F => 1,
        0xC0..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xFF => 4,
        // Only 0x80..=0xBF is left: a continuation byte in lead position.
        _ => panic!("Invalid byte sequences"),
    }
}
/// Returns how many bytes the UTF-8 encoding of `ch` occupies (1 to 4).
///
/// The code point is compared against the exclusive upper bounds
/// `MAX_ONE_B`, `MAX_TWO_B` and `MAX_THREE_B` in ascending order; the first
/// bound it falls below decides the length, and anything at or above
/// `MAX_THREE_B` needs four bytes.
#[inline]
pub fn len_utf8(ch: char) -> u8 {
    match ch as u32 {
        cp if cp < MAX_ONE_B => 1,
        cp if cp < MAX_TWO_B => 2,
        cp if cp < MAX_THREE_B => 3,
        _ => 4,
    }
}
pub fn encode(c: char, buffer: &mut [u8; 4]) -> u8 {
match c {
'\u{0000}' ... '\u{007f}' => {
buffer[0] = c as u8;
1
},
'\u{0080}' ... '\u{07ff}' => {
let n = c as u32;
buffer[0] = (n >> 6 & 0x1F) as u8 | TAG_TWO_B;
buffer[1] = (n & 0x3F) as u8 | TAG_CONT;;
2
},
'\u{0800}' ... '\u{ffff}' => {
let n = c as u32;
buffer[0] = (n >> 12 & 0x0F) as u8 | TAG_THREE_B;
buffer[1] = (n >> 6 & 0x3F) as u8 | TAG_CONT;
buffer[2] = (n & 0x3F) as u8 | TAG_CONT;
3
},
'\u{10000}' ... '\u{10ffff}' => {
let n = c as u32;
buffer[0] = (n >> 18 & 0x07) as u8 | TAG_FOUR_B;
buffer[1] = (n >> 12 & 0x3F) as u8 | TAG_CONT;
buffer[2] = (n >> 6 & 0x3F) as u8 | TAG_CONT;
buffer[3] = (n & 0x3F) as u8 | TAG_CONT;
4
}
}
}
/// Decodes the UTF-8 sequence that starts at `buffer[idx]` and returns its
/// code point as a `u32`.
///
/// The sequence length is taken from the lead byte (`0x00..=0x7F` one byte,
/// `0xC0..=0xDF` two, `0xE0..=0xEF` three, `0xF0..=0xFF` four). The low six
/// bits of each following byte are merged in without checking that the byte
/// really is a continuation byte, and the result is not checked for overlong
/// forms or for being a valid `char`; convert it with `std::char::from_u32`
/// if a `char` is needed.
///
/// # Panics
///
/// * Panics with `"Invalid byte sequences"` if `buffer[idx]` is in
///   `0x80..=0xBF` (a continuation byte in lead position).
/// * Panics on out-of-bounds indexing if `idx` is past the end of `buffer`
///   or the buffer ends before the sequence announced by the lead byte does.
pub fn decode(buffer: &[u8], idx: usize) -> u32 {
    let first = buffer[idx];
    // Six payload bits of the byte `offset` places after the lead byte.
    let cont = |offset: usize| u32::from(buffer[idx + offset] & 0b0011_1111);
    match first {
        0x00..=0x7F => u32::from(first),
        0xC0..=0xDF => (u32::from(first & 0b0001_1111) << 6) | cont(1),
        0xE0..=0xEF => (u32::from(first & 0b0000_1111) << 12) | (cont(1) << 6) | cont(2),
        0xF0..=0xFF => {
            (u32::from(first & 0b0000_0111) << 18) | (cont(1) << 12) | (cont(2) << 6) | cont(3)
        }
        // Only 0x80..=0xBF is left: a continuation byte cannot start a sequence.
        _ => panic!("Invalid byte sequences"),
    }
}
/// Decodes one sample of every UTF-8 sequence length (one to four bytes) out
/// of a single shared buffer and checks the resulting characters.
#[test]
fn test_decode_utf8_seqs() {
    // '$', '¢', 'ह', '€' and '𐍈' laid out back to back.
    let bytes: [u8; 13] = [
        0x24,
        0xC2, 0xA2,
        0xE0, 0xA4, 0xB9,
        0xE2, 0x82, 0xAC,
        0xF0, 0x90, 0x8D, 0x88,
    ];
    // (offset of the lead byte, character expected at that offset)
    let cases = [(0usize, '$'), (1, '¢'), (3, 'ह'), (6, '€'), (9, '𐍈')];
    for &(offset, expected) in cases.iter() {
        assert_eq!(std::char::from_u32(decode(&bytes, offset)), Some(expected));
    }
}