LCOV - code coverage report
Current view: top level - src/connectivity - utf8_utils.cpp (source / functions) Hit Total Coverage
Test: jami-coverage-filtered.info Lines: 26 119 21.8 %
Date: 2024-05-10 07:56:25 Functions: 4 5 80.0 %

          Line data    Source code
       1             : /*
       2             :  *  Copyright (C) 1999 Tom Tromey
       3             :  *  Copyright (C) 2000 Red Hat, Inc.
       4             :  *  Copyright (C) 2004-2024 Savoir-faire Linux Inc.
       5             :  *
       6             :  *  Author: Pascal Potvin <pascal.potvin@extenway.com>
       7             :  *
       8             :  *  This program is free software; you can redistribute it and/or modify
       9             :  *  it under the terms of the GNU General Public License as published by
      10             :  *  the Free Software Foundation; either version 3 of the License, or
      11             :  *  (at your option) any later version.
      12             :  *
      13             :  *  This program is distributed in the hope that it will be useful,
      14             :  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
      15             :  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      16             :  *  GNU General Public License for more details.
      17             :  *
      18             :  *  You should have received a copy of the GNU General Public License
      19             :  *  along with this program; if not, write to the Free Software
      20             :  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA.
      21             :  */
      22             : 
      23             : #include <cstring>
      24             : #include <cassert>
      25             : #include "connectivity/utf8_utils.h"
      26             : 
      27             : #if defined(_MSC_VER)
      28             : #include <BaseTsd.h>
      29             : using ssize_t = SSIZE_T;
      30             : #endif
      31             : 
      32             : /*
      33             :  * The LIKELY and UNLIKELY macros let the programmer give hints to
      34             :  * the compiler about the expected result of an expression. Some compilers
      35             :  * can use this information for optimizations.
      36             :  */
      37             : #if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
      38             : #define LIKELY(expr)   (__builtin_expect(expr, 1))
      39             : #define UNLIKELY(expr) (__builtin_expect(expr, 0))
      40             : #else
      41             : #define LIKELY(expr)   (expr)
      42             : #define UNLIKELY(expr) (expr)
      43             : #endif
      44             : 
      45             : /*
      46             :  * Check whether a Unicode (5.2) char is in a valid range.
      47             :  *
      48             :  * The first check comes from the Unicode guarantee to never encode
      49             :  * a point above 0x0010ffff, since UTF-16 couldn't represent it.
      50             :  *
      51             :  * The second check covers surrogate pairs (category Cs).
      52             :  *
      53             :  * @param Char the character
      54             :  */
      55             : #define UNICODE_VALID(Char) ((Char) < 0x110000 && (((Char) &0xFFFFF800) != 0xD800))
      56             : 
      57             : #define CONTINUATION_CHAR \
      58             :     if ((*(unsigned char*) p & 0xc0) != 0x80) /* 10xxxxxx */ \
      59             :         goto error; \
      60             :     val <<= 6; \
      61             :     val |= (*(unsigned char*) p) & 0x3f;
      62             : 
      63             : namespace jami {
      64             : 
      65             : bool utf8_validate_c_str(const char* str, ssize_t max_len, const char** end);
      66             : 
      67             : static const char*
      68           0 : fast_validate(const char* str)
      69             : {
      70           0 :     char32_t val = 0;
      71           0 :     char32_t min = 0;
      72             :     const char* p;
      73             : 
      74           0 :     for (p = str; *p; p++) {
      75           0 :         if (*(unsigned char*) p < 128)
      76             :             /* done */;
      77             :         else {
      78             :             const char* last;
      79             : 
      80           0 :             last = p;
      81             : 
      82           0 :             if ((*(unsigned char*) p & 0xe0) == 0xc0) { /* 110xxxxx */
      83           0 :                 if (UNLIKELY((*(unsigned char*) p & 0x1e) == 0))
      84           0 :                     goto error;
      85             : 
      86           0 :                 p++;
      87             : 
      88           0 :                 if (UNLIKELY((*(unsigned char*) p & 0xc0) != 0x80)) /* 10xxxxxx */
      89           0 :                     goto error;
      90             :             } else {
      91           0 :                 if ((*(unsigned char*) p & 0xf0) == 0xe0) { /* 1110xxxx */
      92           0 :                     min = (1 << 11);
      93           0 :                     val = *(unsigned char*) p & 0x0f;
      94           0 :                     goto TWO_REMAINING;
      95           0 :                 } else if ((*(unsigned char*) p & 0xf8) == 0xf0) { /* 11110xxx */
      96           0 :                     min = (1 << 16);
      97           0 :                     val = *(unsigned char*) p & 0x07;
      98             :                 } else
      99           0 :                     goto error;
     100             : 
     101           0 :                 p++;
     102           0 :                 CONTINUATION_CHAR;
     103           0 :             TWO_REMAINING:
     104           0 :                 p++;
     105           0 :                 CONTINUATION_CHAR;
     106           0 :                 p++;
     107           0 :                 CONTINUATION_CHAR;
     108             : 
     109           0 :                 if (UNLIKELY(val < min))
     110           0 :                     goto error;
     111             : 
     112           0 :                 if (UNLIKELY(!UNICODE_VALID(val)))
     113           0 :                     goto error;
     114             :             }
     115             : 
     116           0 :             continue;
     117             : 
     118           0 :         error:
     119           0 :             return last;
     120           0 :         }
     121             :     }
     122             : 
     123           0 :     return p;
     124             : }
     125             : 
     126             : static const char*
     127       53096 : fast_validate_len(const char* str, ssize_t max_len)
     128             : {
     129       53096 :     char32_t val = 0;
     130       53096 :     char32_t min = 0;
     131             :     const char* p;
     132             : 
     133       53096 :     assert(max_len >= 0);
     134             : 
     135     5749672 :     for (p = str; ((p - str) < max_len) && *p; p++) {
     136     5696576 :         if (*(unsigned char*) p < 128)
     137             :             /* done */;
     138             :         else {
     139             :             const char* last;
     140             : 
     141           0 :             last = p;
     142             : 
     143           0 :             if ((*(unsigned char*) p & 0xe0) == 0xc0) { /* 110xxxxx */
     144           0 :                 if (UNLIKELY(max_len - (p - str) < 2))
     145           0 :                     goto error;
     146             : 
     147           0 :                 if (UNLIKELY((*(unsigned char*) p & 0x1e) == 0))
     148           0 :                     goto error;
     149             : 
     150           0 :                 p++;
     151             : 
     152           0 :                 if (UNLIKELY((*(unsigned char*) p & 0xc0) != 0x80)) /* 10xxxxxx */
     153           0 :                     goto error;
     154             :             } else {
     155           0 :                 if ((*(unsigned char*) p & 0xf0) == 0xe0) { /* 1110xxxx */
     156           0 :                     if (UNLIKELY(max_len - (p - str) < 3))
     157           0 :                         goto error;
     158             : 
     159           0 :                     min = (1 << 11);
     160           0 :                     val = *(unsigned char*) p & 0x0f;
     161           0 :                     goto TWO_REMAINING;
     162           0 :                 } else if ((*(unsigned char*) p & 0xf8) == 0xf0) { /* 11110xxx */
     163           0 :                     if (UNLIKELY(max_len - (p - str) < 4))
     164           0 :                         goto error;
     165             : 
     166           0 :                     min = (1 << 16);
     167           0 :                     val = *(unsigned char*) p & 0x07;
     168             :                 } else
     169           0 :                     goto error;
     170             : 
     171           0 :                 p++;
     172           0 :                 CONTINUATION_CHAR;
     173           0 :             TWO_REMAINING:
     174           0 :                 p++;
     175           0 :                 CONTINUATION_CHAR;
     176           0 :                 p++;
     177           0 :                 CONTINUATION_CHAR;
     178             : 
     179           0 :                 if (UNLIKELY(val < min))
     180           0 :                     goto error;
     181             : 
     182           0 :                 if (UNLIKELY(!UNICODE_VALID(val)))
     183           0 :                     goto error;
     184             :             }
     185             : 
     186           0 :             continue;
     187             : 
     188           0 :         error:
     189           0 :             return last;
     190           0 :         }
     191             :     }
     192             : 
     193       53096 :     return p;
     194             : }
     195             : 
     196             : /**
     197             :  * utf8_validate_c_str:
     198             :  * @str: a pointer to character data
     199             :  * @max_len: max bytes to validate, or -1 to go until NULL
     200             :  * @end: return location for end of valid data
     201             :  *
     202             :  * Validates UTF-8 encoded text. @str is the text to validate;
     203             :  * if @str is nul-terminated, then @max_len can be -1, otherwise
     204             :  * @max_len should be the number of bytes to validate.
     205             :  * If @end is non-%NULL, then the end of the valid range
     206             :  * will be stored there (i.e. the start of the first invalid
     207             :  * character if some bytes were invalid, or the end of the text
     208             :  * being validated otherwise).
     209             :  *
     210             :  * Note that utf8_validate() returns %false if @max_len is
     211             :  * positive and any of the @max_len bytes are nul.
     212             :  *
     213             :  * Returns true if all of @str was valid. Dbus requires valid UTF-8 as input;
     214             :  * sip packets should also be encoded in utf8; so data read from a file or the
     215             :  * network should be checked with utf8_validate() before doing anything else
     216             :  * with it.
     217             :  *
     218             :  * Returns: true if the text was valid UTF-8
     219             :  */
     220             : bool
     221         107 : utf8_validate_c_str(const char* str, ssize_t max_len, const char** end)
     222             : {
     223             :     const char* p;
     224             : 
     225         107 :     if (max_len < 0)
     226           0 :         p = fast_validate(str);
     227             :     else
     228         107 :         p = fast_validate_len(str, max_len);
     229             : 
     230         107 :     if (end)
     231         107 :         *end = p;
     232             : 
     233         107 :     if ((max_len >= 0 && p != str + max_len) || (max_len < 0 && *p != '\0'))
     234           0 :         return false;
     235             :     else
     236         107 :         return true;
     237             : }
     238             : 
     239             : bool
     240       52989 : utf8_validate(std::string_view str)
     241             : {
     242       52989 :     const char* p = fast_validate_len(str.data(), str.size());
     243             : 
     244       52989 :     return (*p == '\0');
     245             : }
     246             : 
     247             : std::string
     248         107 : utf8_make_valid(std::string_view name)
     249             : {
     250         107 :     ssize_t remaining_bytes = name.size();
     251             :     ssize_t valid_bytes;
     252         107 :     const char* remainder = name.data();
     253             :     const char* invalid;
     254         107 :     char* str = NULL;
     255             :     char* pos;
     256             : 
     257         107 :     while (remaining_bytes != 0) {
     258         107 :         if (utf8_validate_c_str(remainder, remaining_bytes, &invalid))
     259         107 :             break;
     260             : 
     261           0 :         valid_bytes = invalid - remainder;
     262             : 
     263           0 :         if (str == NULL)
     264             :             // If every byte is replaced by U+FFFD, max(strlen(string)) == 3 * name.size()
     265           0 :             str = new char[3 * remaining_bytes];
     266             : 
     267           0 :         pos = str;
     268             : 
     269           0 :         strncpy(pos, remainder, valid_bytes);
     270           0 :         pos += valid_bytes;
     271             : 
     272             :         /* append U+FFFD REPLACEMENT CHARACTER */
     273           0 :         pos[0] = '\357';
     274           0 :         pos[1] = '\277';
     275           0 :         pos[2] = '\275';
     276             : 
     277           0 :         pos += 3;
     278             : 
     279           0 :         remaining_bytes -= valid_bytes + 1;
     280           0 :         remainder = invalid + 1;
     281             :     }
     282             : 
     283         107 :     if (str == NULL)
     284         107 :         return std::string(name);
     285             : 
     286           0 :     strncpy(pos, remainder, remaining_bytes);
     287           0 :     pos += remaining_bytes;
     288             : 
     289           0 :     std::string answer(str, pos - str);
     290           0 :     assert(utf8_validate(answer));
     291             : 
     292           0 :     delete[] str;
     293             : 
     294           0 :     return answer;
     295           0 : }
     296             : 
     297             : } // namespace jami

Generated by: LCOV version 1.14