/*
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 * Copyright (C) 2004-2024 Savoir-faire Linux Inc.
 *
 * Author: Pascal Potvin <pascal.potvin@extenway.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <cassert>
#include <cstddef>
#include <cstring>
#include <string>
#include <string_view>

#include "connectivity/utf8_utils.h"

#if defined(_MSC_VER)
#include <BaseTsd.h>
using ssize_t = SSIZE_T;
#else
#include <sys/types.h>
#endif

/*
 * The LIKELY and UNLIKELY macros let the programmer give hints to
 * the compiler about the expected result of an expression. Some compilers
 * can use this information for optimizations.
 */
#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
#define LIKELY(expr)   (__builtin_expect(expr, 1))
#define UNLIKELY(expr) (__builtin_expect(expr, 0))
#else
#define LIKELY(expr)   (expr)
#define UNLIKELY(expr) (expr)
#endif

namespace jami {

bool utf8_validate_c_str(const char* str, ssize_t max_len, const char** end);

/**
 * Check whether a Unicode (5.2) code point is in a valid range.
 *
 * The first check comes from the Unicode guarantee to never encode
 * a point above 0x0010ffff, since UTF-16 couldn't represent it.
 *
 * The second check covers surrogate pairs (category Cs).
 *
 * @param code_point the decoded character
 */
static constexpr bool
is_valid_code_point(char32_t code_point) noexcept
{
    return code_point < 0x110000 && (code_point & 0xFFFFF800) != 0xD800;
}

/** True when @p byte has the form 10xxxxxx, i.e. is a UTF-8 continuation byte. */
static constexpr bool
is_continuation_byte(unsigned char byte) noexcept
{
    return (byte & 0xc0) == 0x80;
}

/**
 * Decode and validate the multi-byte UTF-8 sequence that starts at @p seq.
 *
 * Bytes are inspected strictly in order and decoding stops at the first byte
 * that is not a continuation byte. A NUL is never a continuation byte, so in
 * NUL-terminated mode the terminator is never stepped over.
 *
 * @param seq   points at a byte >= 0x80 (the caller handles ASCII itself)
 * @param avail number of readable bytes starting at @p seq, or a negative
 *              value when the data is NUL-terminated instead of length-bounded
 * @return the sequence length (2, 3 or 4) when it is well formed, or 0 when the
 *         lead byte is illegal, the sequence is truncated, a continuation byte
 *         is missing, the encoding is overlong, or the decoded value is not a
 *         valid code point
 */
static std::ptrdiff_t
multibyte_sequence_length(const unsigned char* seq, std::ptrdiff_t avail) noexcept
{
    const unsigned char lead = seq[0];
    std::ptrdiff_t length = 0;
    char32_t value = 0;
    char32_t minimum = 0; // smallest code point that legitimately needs `length` bytes

    if ((lead & 0xe0) == 0xc0) { /* 110xxxxx */
        length = 2;
        value = lead & 0x1f;
        minimum = 1u << 7;
    } else if ((lead & 0xf0) == 0xe0) { /* 1110xxxx */
        length = 3;
        value = lead & 0x0f;
        minimum = 1u << 11;
    } else if ((lead & 0xf8) == 0xf0) { /* 11110xxx */
        length = 4;
        value = lead & 0x07;
        minimum = 1u << 16;
    } else {
        return 0; // stray continuation byte, or a lead byte in 0xF8..0xFF
    }

    if (avail >= 0 && avail < length)
        return 0; // sequence would run past the end of the buffer

    for (std::ptrdiff_t i = 1; i < length; ++i) {
        if (!is_continuation_byte(seq[i]))
            return 0;
        value = (value << 6) | (seq[i] & 0x3fu);
    }

    if (value < minimum) // overlong encoding
        return 0;

    return is_valid_code_point(value) ? length : 0;
}

/**
 * Scan a NUL-terminated string.
 *
 * @return a pointer to the terminating NUL when everything is valid UTF-8,
 *         otherwise a pointer to the first byte of the first invalid sequence
 */
static const char*
fast_validate(const char* str)
{
    const auto* cursor = reinterpret_cast<const unsigned char*>(str);

    while (*cursor != 0) {
        if (*cursor < 0x80) { // plain ASCII
            ++cursor;
            continue;
        }

        const std::ptrdiff_t length = multibyte_sequence_length(cursor, -1);
        if (length == 0)
            break;
        cursor += length;
    }

    return reinterpret_cast<const char*>(cursor);
}

/**
 * Scan at most @p max_len bytes, stopping early at a NUL byte.
 *
 * @return a pointer to where scanning stopped: `str + max_len` when all bytes
 *         are valid, otherwise the first NUL byte or the first byte of the
 *         first invalid (or truncated) sequence
 */
static const char*
fast_validate_len(const char* str, ssize_t max_len)
{
    assert(max_len >= 0);

    const auto* cursor = reinterpret_cast<const unsigned char*>(str);
    std::ptrdiff_t remaining = max_len;

    while (remaining > 0 && *cursor != 0) {
        if (*cursor < 0x80) { // plain ASCII
            ++cursor;
            --remaining;
            continue;
        }

        const std::ptrdiff_t length = multibyte_sequence_length(cursor, remaining);
        if (length == 0)
            break;
        cursor += length;
        remaining -= length;
    }

    return reinterpret_cast<const char*>(cursor);
}

/**
 * utf8_validate_c_str:
 * @str: a pointer to character data
 * @max_len: max bytes to validate, or -1 to go until NUL
 * @end: return location for end of valid data
 *
 * Validates UTF-8 encoded text. @str is the text to validate;
 * if @str is nul-terminated, then @max_len can be -1, otherwise
 * @max_len should be the number of bytes to validate.
 * If @end is non-%NULL, then the end of the valid range
 * will be stored there (i.e. the start of the first invalid
 * character if some bytes were invalid, or the end of the text
 * being validated otherwise).
 *
 * Note that utf8_validate_c_str() returns %false if @max_len is
 * non-negative and any of the @max_len bytes are nul.
 *
 * Dbus requires valid UTF-8 as input; sip packets should also be encoded in
 * utf8; so data read from a file or the network should be checked with
 * utf8_validate() before doing anything else with it.
 *
 * Returns: true if the text was valid UTF-8
 */
bool
utf8_validate_c_str(const char* str, ssize_t max_len, const char** end)
{
    const bool bounded = max_len >= 0;
    const char* const stop = bounded ? fast_validate_len(str, max_len) : fast_validate(str);

    if (end)
        *end = stop;

    return bounded ? stop == str + max_len : *stop == '\0';
}

/**
 * Check that every byte of @p str is part of valid UTF-8.
 *
 * The view does not have to be NUL-terminated. An embedded NUL byte makes the
 * text invalid, consistently with utf8_validate_c_str().
 *
 * @return true if the whole view is valid UTF-8 (an empty view is valid)
 */
bool
utf8_validate(std::string_view str)
{
    const char* const stop = fast_validate_len(str.data(), static_cast<ssize_t>(str.size()));

    // Compare positions rather than reading *stop: the byte at data() + size()
    // is not part of the view and may not be readable at all.
    return stop == str.data() + str.size();
}

/**
 * Build a valid UTF-8 copy of @p name.
 *
 * Every byte that cannot be part of a valid sequence (including an embedded
 * NUL) is replaced by U+FFFD REPLACEMENT CHARACTER; valid runs are copied
 * unchanged.
 *
 * @return the sanitized string; identical to @p name when it was already valid
 */
std::string
utf8_make_valid(std::string_view name)
{
    constexpr std::string_view replacement_character {"\xEF\xBF\xBD"}; // U+FFFD

    std::string answer;
    answer.reserve(name.size()); // exact for the common, already-valid case

    const char* remainder = name.data();
    ssize_t remaining_bytes = static_cast<ssize_t>(name.size());

    while (remaining_bytes != 0) {
        const char* invalid = nullptr;
        if (utf8_validate_c_str(remainder, remaining_bytes, &invalid))
            break;

        const ssize_t valid_bytes = invalid - remainder;
        answer.append(remainder, static_cast<std::size_t>(valid_bytes));
        answer.append(replacement_character);

        // Skip exactly one offending byte and re-synchronise on the next one.
        remaining_bytes -= valid_bytes + 1;
        remainder = invalid + 1;
    }

    if (remaining_bytes > 0)
        answer.append(remainder, static_cast<std::size_t>(remaining_bytes));

    assert(utf8_validate(answer));
    return answer;
}

} // namespace jami