Lolly 1.4.27
Loading...
Searching...
No Matches
unicode.cpp
Go to the documentation of this file.
1
2/******************************************************************************
3 * MODULE : unicode.cpp
4 * DESCRIPTION: routines on Unicode
5 * COPYRIGHT : (C) 2023-2024 Darcy Shen
6 *******************************************************************************
7 * This software falls under the GNU general public license version 3 or later.
8 * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
9 * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
10 ******************************************************************************/
11
12#include "unicode.hpp"
13#include "tbox/tbox.h"
14
15namespace lolly {
16namespace data {
17
20 if (/* 0x0 <= code && */ code <= 0x7F) {
21 // 0x0ddddddd
22 return string ((char) code);
23 }
24 else if (0x80 <= code && code <= 0x7FF) {
25 // 0x110ddddd 0x10dddddd
26 string str (2);
27 str[0]= ((code >> 6) & 0x1F) | 0xC0;
28 str[1]= (code & 0x3F) | 0x80;
29 return str;
30 }
31 else if (0x800 <= code && code <= 0xFFFF) {
32 // 0x1110dddd 0x10dddddd 0x10dddddd
33 string str (3);
34 str[0]= ((code >> 12) & 0x0F) | 0xE0;
35 str[1]= ((code >> 6) & 0x3F) | 0x80;
36 str[2]= (code & 0x3F) | 0x80;
37 return str;
38 }
39 else if (0x10000 <= code && code <= 0x1FFFFF) {
40 // 0x11110uuu 0x10zzzzzz 0x10yyyyyy 0x10xxxxxx
41 string str (4);
42 str[0]= ((code >> 18) & 0x07) | 0xF0;
43 str[1]= ((code >> 12) & 0x3F) | 0x80;
44 str[2]= ((code >> 6) & 0x3F) | 0x80;
45 str[3]= (code & 0x3F) | 0x80;
46 return str;
47 }
48 else return "";
49}
50
53 unsigned char c= s[i];
54 if ((0x80 & c) == 0) {
55 // 0x0ddddddd
56 i++;
57 return (uint32_t) c;
58 }
59 uint32_t code;
60 int trail;
61 if ((0xE0 & c) == 0xC0) {
62 // 0x110ddddd 0x10dddddd
63 trail= 1;
64 code = c & 0x1F;
65 }
66 else if ((0xF0 & c) == 0xE0) {
67 // 0x1110dddd 0x10dddddd 0x10dddddd
68 trail= 2;
69 code = c & 0x0F;
70 }
71 else if ((0xF8 & c) == 0xF0) {
72 // 0x11110dddd 0x10dddddd 0x10dddddd 0x10dddddd
73 trail= 3;
74 code = c & 0x07;
75 }
76 else {
77 // failsafe
78 // cout << "failsafe: " << c << " (" << (unsigned int)(c) << ")\n";
79 i++;
80 return (uint32_t) c;
81 }
82 int start= i - 1;
83 for (; trail > 0; trail--) {
84 i++;
85 if (i >= N (s)) i= N (s) - 1;
86 c= s[i];
87 if ((0xC0 & c) == 0x80) code= (code << 6) | (c & 0x3F);
88 else {
89 i= start + 1;
90 c= s[i++];
91 return c;
92 }
93 }
94 i++;
95 return code;
96}
97
98string
100 if (code <= 0x7f) return "ascii";
101 else if (code >= 0x80 && code <= 0x37f) return "latin";
102 else if (code >= 0x380 && code <= 0x3ff) return "greek";
103 else if (code >= 0x400 && code <= 0x4ff) return "cyrillic";
104 else if (code >= 0x2460 && code <= 0x24ff) return "enclosed_alphanumerics";
105 else if (code >= 0x3000 && code <= 0x303f) return "cjk";
106 else if (code >= 0x4e00 && code <= 0x9fcc) return "cjk";
107 else if (code >= 0xff00 && code <= 0xffef) return "cjk";
108 else if (code >= 0x3040 && code <= 0x309F) return "hiragana";
109 else if (code >= 0xac00 && code <= 0xd7af) return "hangul";
110 else if (code >= 0x2000 && code <= 0x23ff) return "mathsymbols";
111 else if (code >= 0x2900 && code <= 0x2e7f) return "mathextra";
112 else if (code >= 0x1d400 && code <= 0x1d7ff) return "mathletters";
113 else return "";
114}
115
116bool
118 int n= N (s);
119 for (int i= 0; i < n; i++)
120 if (s[i] == '<' && i + 1 < n && s[i + 1] == '#') {
121 int start= i + 2;
122 i = i + 2;
123 while (i < n && s[i] != '>')
124 i++;
125 string r= s (start, i);
126 if ("4E00" <= r && r <= "9FBF") continue;
127 else return false;
128 }
129 else {
130 return false;
131 }
132 return true;
133}
134
135bool
137 int n= N (s);
138 for (int i= 0; i < n; i++)
139 if (s[i] == '<' && i + 1 < n && s[i + 1] == '#') {
140 int start= i + 2;
141 i = i + 2;
142 while (i < n && s[i] != '>')
143 i++;
144 string r= s (start, i);
145 if ("4E00" <= r && r <= "9FBF") return true;
146 else continue;
147 }
148 else {
149 continue;
150 }
151 return false;
152}
153
// Convert UTF-16 string to UTF-8 string.
// NOTE(review): this listing is incomplete. The lines carrying the function
// signature, the declarations of isize, idata and odata, and whatever runs
// between the copy loop and `string ret` are not visible here; the latter is
// presumably the charset conversion call. Confirm against the real file.
154string
// The output size starts as four times the input size.
158 tb_long_t osize= (tb_long_t) (isize << 2);
160
// Copy the input into idata one element at a time, cast to tb_byte_t.
161 for (tb_size_t i= 0; i < isize; i++) {
162 idata[i]= (tb_byte_t) s_u16[i];
163 }
164
167
// Build the result from the first osize bytes of odata.
// NOTE(review): i is tb_size_t while osize is tb_long_t; if these differ in
// signedness the comparison is mixed-sign. Confirm osize cannot be negative.
168 string ret ((int) osize);
169 for (tb_size_t i= 0; i < osize; i++) {
170 ret[i]= (char) odata[i];
171 }
// Both buffers are handed to tb_free when non-null before returning.
172 if (idata) tb_free (idata);
173 if (odata) tb_free (odata);
174 return ret;
175}
176
177#if defined(OS_MINGW) || defined(OS_WIN)
// Build a `string` from the wchar_t buffer s_u16; only compiled when OS_MINGW
// or OS_WIN is defined.
// NOTE(review): this listing is incomplete. The declarations of wchar_size,
// isize, idata, odata and bytes, and whatever runs between the loop and
// `string ret`, are not visible here; the latter is presumably the charset
// conversion call. Confirm against the real file.
178string
179wchar_to_utf8 (const wchar_t* s_u16) {
// The output size starts as four times isize.
183 tb_long_t osize = (tb_long_t) (isize << 2);
185
// Each unit is split into two bytes and stored high byte first, low byte
// second, at idata[2*i] and idata[2*i + 1].
// NOTE(review): `bytes` is presumably s_u16[i]; its definition is not visible.
186 for (tb_size_t i= 0; i < wchar_size; i++) {
188 tb_byte_t high = (tb_byte_t) (bytes >> 8);
189 tb_byte_t low = (tb_byte_t) (bytes & 0x00FF);
190 idata[2 * i] = high;
191 idata[2 * i + 1]= low;
192 }
193
196
// Build the result from the first osize bytes of odata.
197 string ret ((int) osize);
198 for (tb_size_t i= 0; i < osize; i++) {
199 ret[i]= (char) odata[i];
200 }
// Both buffers are handed to tb_free when non-null before returning.
201 if (idata) tb_free (idata);
202 if (odata) tb_free (odata);
203 return ret;
204}
205#endif
206
// Convert UTF-8 string to UTF-16 string.
// NOTE(review): this listing is incomplete. The function signature line, the
// declaration of odata, and whatever fills odata before `string ret` are not
// visible here; the latter is presumably the charset conversion call. Confirm
// against the real file.
207string
// The output size starts as four times the length of s_u8.
209 tb_long_t osize= (tb_long_t) (N (s_u8) << 2);
211
214
// Build the result from the first osize bytes of odata.
215 string ret ((int) osize);
216 for (tb_size_t i= 0; i < osize; i++) {
217 ret[i]= odata[i];
218 }
// odata is handed to tb_free when non-null before returning.
219 if (odata) tb_free (odata);
220 return ret;
221}
222
223} // namespace data
224} // namespace lolly
The list class represents a linked list.
Definition list.hpp:48
string_u8 encode_as_utf8(uint32_t code)
Encode a code point (given as a 32-bit integer) as a UTF-8 string_u8.
Definition unicode.cpp:19
string unicode_get_range(int code)
Definition unicode.cpp:99
uint32_t decode_from_utf8(string_u8 s, int &i)
Decode the UTF-8 sequence starting at position i of a string_u8 into a 32-bit code point, advancing i past it.
Definition unicode.cpp:52
bool is_cjk_unified_ideographs(string s)
Checks if a string contains only CJK Unified Ideographs.
Definition unicode.cpp:117
string utf16_to_utf8(string s_u16)
Convert UTF-16 string to UTF-8 string.
Definition unicode.cpp:155
int N(lolly_tree< T > t)
string utf8_to_utf16(string s_u8)
Convert UTF-8 string to UTF-16 string.
Definition unicode.cpp:208
bool has_cjk_unified_ideographs(string s)
Checks if a string contains any CJK Unified Ideographs.
Definition unicode.cpp:136