/*
 *  Copyright (C) 1999 Tom Tromey
 *  Copyright (C) 2000 Red Hat, Inc.
 *  Copyright (C) 2004-2024 Savoir-faire Linux Inc.
 *
 *  Author: Pascal Potvin <pascal.potvin@extenway.com>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

#include <cassert>
#include <cstring>
#include <string>
#include <string_view>

#include "connectivity/utf8_utils.h"

#if defined(_MSC_VER)
#include <BaseTsd.h>
using ssize_t = SSIZE_T;
#endif

/*
 * The LIKELY and UNLIKELY macros let the programmer give hints to
 * the compiler about the expected result of an expression. Some compilers
 * can use this information for optimizations.
 */
#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
#define LIKELY(expr)   (__builtin_expect(expr, 1))
#define UNLIKELY(expr) (__builtin_expect(expr, 0))
#else
#define LIKELY(expr)   (expr)
#define UNLIKELY(expr) (expr)
#endif

/*
 * Check whether a Unicode (5.2) char is in a valid range.
 *
 * The first check comes from the Unicode guarantee to never encode
 * a point above 0x0010ffff, since UTF-16 is unable to represent it.
 *
 * The second check covers surrogate pairs (category Cs).
 *
 * @param Char the character
 */
#define UNICODE_VALID(Char) ((Char) < 0x110000 && (((Char) &0xFFFFF800) != 0xD800))

namespace jami {

bool utf8_validate_c_str(const char* str, ssize_t max_len, const char** end);

namespace {

/* U+FFFD REPLACEMENT CHARACTER, encoded as UTF-8. */
constexpr std::string_view REPLACEMENT_CHARACTER {"\357\277\275"};

/**
 * Measure the multi-byte UTF-8 sequence whose lead byte is seq[0].
 *
 * @param seq        first byte of the sequence; the caller has already
 *                   established that it is >= 0x80
 * @param available  number of readable bytes starting at seq, or a negative
 *                   value when the data is NUL-terminated instead
 * @return the length of the sequence in bytes (2, 3 or 4), or 0 when the
 *         sequence is malformed, truncated, overlong, or encodes a value
 *         rejected by UNICODE_VALID
 */
ssize_t
multibyte_sequence_length(const unsigned char* seq, ssize_t available)
{
    ssize_t length;
    char32_t smallest; /* smallest code point that legitimately needs `length` bytes */
    char32_t val;

    if ((seq[0] & 0xe0) == 0xc0) { /* 110xxxxx */
        length = 2;
        smallest = (1 << 7);
        val = seq[0] & 0x1f;
    } else if ((seq[0] & 0xf0) == 0xe0) { /* 1110xxxx */
        length = 3;
        smallest = (1 << 11);
        val = seq[0] & 0x0f;
    } else if ((seq[0] & 0xf8) == 0xf0) { /* 11110xxx */
        length = 4;
        smallest = (1 << 16);
        val = seq[0] & 0x07;
    } else {
        return 0; /* stray continuation byte or invalid lead byte */
    }

    /* Bounded input: the whole sequence must fit before any byte is read. */
    if (available >= 0 && available < length)
        return 0;

    /*
     * Bytes are examined one at a time and the scan stops at the first one
     * that is not a continuation byte. A NUL terminator is not a continuation
     * byte, so NUL-terminated input is never read past its terminator.
     */
    for (ssize_t i = 1; i < length; ++i) {
        if ((seq[i] & 0xc0) != 0x80) /* 10xxxxxx */
            return 0;
        val = (val << 6) | (seq[i] & 0x3f);
    }

    /* Reject overlong encodings, surrogates and values above 0x10ffff. */
    if (val < smallest || !UNICODE_VALID(val))
        return 0;

    return length;
}

/**
 * Scan a NUL-terminated string.
 *
 * @return a pointer to the terminating NUL when everything is valid UTF-8,
 *         otherwise a pointer to the first byte of the first invalid sequence
 */
const char*
fast_validate(const char* str)
{
    const auto* p = reinterpret_cast<const unsigned char*>(str);

    while (*p != 0) {
        ssize_t length = 1; /* ASCII */
        if (*p >= 0x80) {
            length = multibyte_sequence_length(p, -1);
            if (length == 0)
                break;
        }
        p += length;
    }

    return reinterpret_cast<const char*>(p);
}

/**
 * Scan at most max_len bytes, stopping early at a NUL byte.
 *
 * @return str + max_len when all max_len bytes are valid UTF-8 without NUL,
 *         otherwise a pointer to the NUL byte or to the first byte of the
 *         first invalid (or truncated) sequence
 */
const char*
fast_validate_len(const char* str, ssize_t max_len)
{
    assert(max_len >= 0);

    const auto* bytes = reinterpret_cast<const unsigned char*>(str);
    ssize_t offset = 0;

    while (offset < max_len && bytes[offset] != 0) {
        ssize_t length = 1; /* ASCII */
        if (bytes[offset] >= 0x80) {
            length = multibyte_sequence_length(bytes + offset, max_len - offset);
            if (length == 0)
                break;
        }
        offset += length;
    }

    return str + offset;
}

} // namespace

/**
 * utf8_validate_c_str:
 * @str: a pointer to character data
 * @max_len: max bytes to validate, or -1 to go until NULL
 * @end: return location for end of valid data
 *
 * Validates UTF-8 encoded text. @str is the text to validate;
 * if @str is nul-terminated, then @max_len can be -1, otherwise
 * @max_len should be the number of bytes to validate.
 * If @end is non-%NULL, then the end of the valid range
 * will be stored there (i.e. the start of the first invalid
 * character if some bytes were invalid, or the end of the text
 * being validated otherwise).
 *
 * Note that utf8_validate_c_str() returns %false if @max_len is
 * non-negative and any of the @max_len bytes are nul.
 *
 * Dbus requires valid UTF-8 as input; sip packets should also be encoded
 * in utf8; so data read from a file or the network should be checked
 * before doing anything else with it.
 *
 * Returns: true if the text was valid UTF-8
 */
bool
utf8_validate_c_str(const char* str, ssize_t max_len, const char** end)
{
    const char* p = (max_len < 0) ? fast_validate(str) : fast_validate_len(str, max_len);

    if (end)
        *end = p;

    /* Valid only when the scan consumed the whole input. */
    return (max_len < 0) ? (*p == '\0') : (p == str + max_len);
}

/**
 * Check that every byte of @str forms valid UTF-8.
 *
 * The view does not have to be NUL-terminated: the result is decided by
 * whether the scan reached the end of the view, and no byte outside the view
 * is read. A NUL byte inside the view makes the text invalid, as it does for
 * utf8_validate_c_str() with a non-negative length.
 *
 * Returns: true if the text was valid UTF-8
 */
bool
utf8_validate(std::string_view str)
{
    const char* p = fast_validate_len(str.data(), static_cast<ssize_t>(str.size()));

    return p == str.data() + str.size();
}

/**
 * Return a copy of @name in which every byte that cannot be part of a valid
 * UTF-8 sequence (including NUL bytes) is replaced by U+FFFD REPLACEMENT
 * CHARACTER. Valid input is returned unchanged.
 */
std::string
utf8_make_valid(std::string_view name)
{
    const char* remainder = name.data();
    ssize_t remaining_bytes = static_cast<ssize_t>(name.size());
    std::string answer;

    while (remaining_bytes != 0) {
        const char* invalid = nullptr;
        if (utf8_validate_c_str(remainder, remaining_bytes, &invalid))
            break;

        if (answer.empty())
            answer.reserve(name.size() + REPLACEMENT_CHARACTER.size());

        /* Keep the valid run, then substitute the single offending byte. */
        const ssize_t valid_bytes = invalid - remainder;
        answer.append(remainder, static_cast<std::size_t>(valid_bytes));
        answer.append(REPLACEMENT_CHARACTER);

        remaining_bytes -= valid_bytes + 1;
        remainder = invalid + 1;
    }

    /* Nothing was appended, so no byte needed replacing. */
    if (answer.empty())
        return std::string(name);

    /* Trailing valid run after the last replaced byte. */
    answer.append(remainder, static_cast<std::size_t>(remaining_bytes));
    assert(utf8_validate(answer));

    return answer;
}

} // namespace jami