LCOV - code coverage report
Current view: top level - 11/bits - regex_scanner.tcc (source / functions) Hit Total Coverage
Test: jami-coverage-filtered.info Lines: 108 222 48.6 %
Date: 2025-08-24 09:11:10 Functions: 6 9 66.7 %

          Line data    Source code
       1             : // class template regex -*- C++ -*-
       2             : 
       3             : // Copyright (C) 2013-2021 Free Software Foundation, Inc.
       4             : //
       5             : // This file is part of the GNU ISO C++ Library.  This library is free
       6             : // software; you can redistribute it and/or modify it under the
       7             : // terms of the GNU General Public License as published by the
       8             : // Free Software Foundation; either version 3, or (at your option)
       9             : // any later version.
      10             : 
      11             : // This library is distributed in the hope that it will be useful,
      12             : // but WITHOUT ANY WARRANTY; without even the implied warranty of
      13             : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14             : // GNU General Public License for more details.
      15             : 
      16             : // Under Section 7 of GPL version 3, you are granted additional
      17             : // permissions described in the GCC Runtime Library Exception, version
      18             : // 3.1, as published by the Free Software Foundation.
      19             : 
      20             : // You should have received a copy of the GNU General Public License and
      21             : // a copy of the GCC Runtime Library Exception along with this program;
      22             : // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
      23             : // <http://www.gnu.org/licenses/>.
      24             : 
      25             : /**
      26             :  *  @file bits/regex_scanner.tcc
      27             :  *  This is an internal header file, included by other library headers.
      28             :  *  Do not attempt to use it directly. @headername{regex}
      29             :  */
      30             : 
      31             : // FIXME make comments doxygen format.
      32             : 
      33             : // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
      34             : // and awk
      35             : // 1) grep is basic except '\n' is treated as '|'
      36             : // 2) egrep is extended except '\n' is treated as '|'
      37             : // 3) awk is extended except special escaping rules, and there's no
      38             : //    back-reference.
      39             : //
      40             : // References:
      41             : //
      42             : // ECMAScript: ECMA-262 15.10
      43             : //
      44             : // basic, extended:
      45             : // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
      46             : //
      47             : // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
      48             : 
      49             : namespace std _GLIBCXX_VISIBILITY(default)
      50             : {
      51             : _GLIBCXX_BEGIN_NAMESPACE_VERSION
      52             : 
      53             : namespace __detail
      54             : {
      55             :   template<typename _CharT>
      56         368 :     _Scanner<_CharT>::
      57             :     _Scanner(const _CharT* __begin, const _CharT* __end,
      58             :              _FlagT __flags, std::locale __loc)
      59             :     : _ScannerBase(__flags),
      60         368 :       _M_current(__begin), _M_end(__end),
      61         368 :       _M_ctype(std::use_facet<_CtypeT>(__loc)),
      62         368 :       _M_eat_escape(_M_is_ecma()
      63             :                     ? &_Scanner::_M_eat_escape_ecma
      64         736 :                     : &_Scanner::_M_eat_escape_posix)
      65         368 :     { _M_advance(); }
      66             : 
      67             :   template<typename _CharT>
      68             :     void
      69       11981 :     _Scanner<_CharT>::
      70             :     _M_advance()
      71             :     {
      72       11981 :       if (_M_current == _M_end)
      73             :         {
      74         736 :           _M_token = _S_token_eof;
      75         736 :           return;
      76             :         }
      77             : 
      78       11245 :       if (_M_state == _S_state_normal)
      79        8134 :         _M_scan_normal();
      80        3111 :       else if (_M_state == _S_state_in_bracket)
      81        2695 :         _M_scan_in_bracket();
      82         416 :       else if (_M_state == _S_state_in_brace)
      83         416 :         _M_scan_in_brace();
      84             :       else
      85             :         {
      86             :           __glibcxx_assert(false);
      87             :         }
      88             :     }
      89             : 
      90             :   // Differences between styles:
      91             :   // 1) "\(", "\)", "\{" in basic. It's not escaping.
      92             :   // 2) "(?:", "(?=", "(?!" in ECMAScript.
      93             :   template<typename _CharT>
      94             :     void
      95        8134 :     _Scanner<_CharT>::
      96             :     _M_scan_normal()
      97             :     {
      98        8134 :       auto __c = *_M_current++;
      99             : 
     100        8134 :       if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr)
     101             :         {
     102        3554 :           _M_token = _S_token_ord_char;
     103        3554 :           _M_value.assign(1, __c);
     104        3554 :           return;
     105             :         }
     106        4580 :       if (__c == '\\')
     107             :         {
     108         582 :           if (_M_current == _M_end)
     109           0 :             __throw_regex_error(
     110             :               regex_constants::error_escape,
     111             :               "Unexpected end of regex when escaping.");
     112             : 
     113         582 :           if (!_M_is_basic()
     114         582 :               || (*_M_current != '('
     115           0 :                   && *_M_current != ')'
     116           0 :                   && *_M_current != '{'))
     117             :             {
     118         582 :               (this->*_M_eat_escape)();
     119         582 :               return;
     120             :             }
     121           0 :           __c = *_M_current++;
     122             :         }
     123        3998 :       if (__c == '(')
     124             :         {
     125         828 :           if (_M_is_ecma() && *_M_current == '?')
     126             :             {
     127         141 :               if (++_M_current == _M_end)
     128           0 :                 __throw_regex_error(
     129             :                   regex_constants::error_paren,
     130             :                   "Unexpected end of regex when in an open parenthesis.");
     131             : 
     132         141 :               if (*_M_current == ':')
     133             :                 {
     134         141 :                   ++_M_current;
     135         141 :                   _M_token = _S_token_subexpr_no_group_begin;
     136             :                 }
     137           0 :               else if (*_M_current == '=')
     138             :                 {
     139           0 :                   ++_M_current;
     140           0 :                   _M_token = _S_token_subexpr_lookahead_begin;
     141           0 :                   _M_value.assign(1, 'p');
     142             :                 }
     143           0 :               else if (*_M_current == '!')
     144             :                 {
     145           0 :                   ++_M_current;
     146           0 :                   _M_token = _S_token_subexpr_lookahead_begin;
     147           0 :                   _M_value.assign(1, 'n');
     148             :                 }
     149             :               else
     150           0 :                 __throw_regex_error(
     151             :                   regex_constants::error_paren,
     152             :                   "Invalid special open parenthesis.");
     153             :             }
     154         687 :           else if (_M_flags & regex_constants::nosubs)
     155           0 :             _M_token = _S_token_subexpr_no_group_begin;
     156             :           else
     157         687 :             _M_token = _S_token_subexpr_begin;
     158             :         }
     159        3170 :       else if (__c == ')')
     160         828 :         _M_token = _S_token_subexpr_end;
     161        2342 :       else if (__c == '[')
     162             :         {
     163         400 :           _M_state = _S_state_in_bracket;
     164         400 :           _M_at_bracket_start = true;
     165         400 :           if (_M_current != _M_end && *_M_current == '^')
     166             :             {
     167         111 :               _M_token = _S_token_bracket_neg_begin;
     168         111 :               ++_M_current;
     169             :             }
     170             :           else
     171         289 :             _M_token = _S_token_bracket_begin;
     172             :         }
     173        1942 :       else if (__c == '{')
     174             :         {
     175         104 :           _M_state = _S_state_in_brace;
     176         104 :           _M_token = _S_token_interval_begin;
     177             :         }
     178        1838 :       else if (__builtin_expect(__c == _CharT(0), false))
     179             :         {
     180           0 :           if (!_M_is_ecma())
     181             :             {
     182           0 :               __throw_regex_error(regex_constants::_S_null,
     183             :                   "Unexpected null character in regular expression");
     184             :             }
     185           0 :           _M_token = _S_token_ord_char;
     186           0 :           _M_value.assign(1, __c);
     187             :         }
     188        1838 :       else if (__c != ']' && __c != '}')
     189             :         {
     190        1838 :           auto __it = _M_token_tbl;
     191        1838 :           auto __narrowc = _M_ctype.narrow(__c, '\0');
     192        8725 :           for (; __it->first != '\0'; ++__it)
     193        8725 :             if (__it->first == __narrowc)
     194             :               {
     195        1838 :                 _M_token = __it->second;
     196        1838 :                 return;
     197             :               }
     198             :           __glibcxx_assert(false);
     199           0 :         }
     200             :       else
     201             :         {
     202           0 :           _M_token = _S_token_ord_char;
     203           0 :           _M_value.assign(1, __c);
     204             :         }
     205             :     }
     206             : 
     207             :   // Differences between styles:
     208             :   // 1) different semantics of "[]" and "[^]".
     209             :   // 2) Escaping in bracket expr.
     210             :   template<typename _CharT>
     211             :     void
     212        2695 :     _Scanner<_CharT>::
     213             :     _M_scan_in_bracket()
     214             :     {
     215        2695 :       if (_M_current == _M_end)
     216           0 :         __throw_regex_error(
     217             :           regex_constants::error_brack,
     218             :           "Unexpected end of regex when in bracket expression.");
     219             : 
     220        2695 :       auto __c = *_M_current++;
     221             : 
     222        2695 :       if (__c == '-')
     223         381 :         _M_token = _S_token_bracket_dash;
     224        2314 :       else if (__c == '[')
     225             :         {
     226           0 :           if (_M_current == _M_end)
     227           0 :             __throw_regex_error(regex_constants::error_brack,
     228             :                                 "Unexpected character class open bracket.");
     229             : 
     230           0 :           if (*_M_current == '.')
     231             :             {
     232           0 :               _M_token = _S_token_collsymbol;
     233           0 :               _M_eat_class(*_M_current++);
     234             :             }
     235           0 :           else if (*_M_current == ':')
     236             :             {
     237           0 :               _M_token = _S_token_char_class_name;
     238           0 :               _M_eat_class(*_M_current++);
     239             :             }
     240           0 :           else if (*_M_current == '=')
     241             :             {
     242           0 :               _M_token = _S_token_equiv_class_name;
     243           0 :               _M_eat_class(*_M_current++);
     244             :             }
     245             :           else
     246             :             {
     247           0 :               _M_token = _S_token_ord_char;
     248           0 :               _M_value.assign(1, __c);
     249             :             }
     250             :         }
     251             :       // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
     252             :       // literally. So "[]]" and "[^]]" are valid regexes. See the testcases
     253             :       // `*/empty_range.cc`.
     254        2314 :       else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
     255             :         {
     256         400 :           _M_token = _S_token_bracket_end;
     257         400 :           _M_state = _S_state_normal;
     258             :         }
     259             :       // ECMAScript and awk permits escaping in bracket.
     260        1914 :       else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
     261         339 :         (this->*_M_eat_escape)();
     262             :       else
     263             :         {
     264        1575 :           _M_token = _S_token_ord_char;
     265        1575 :           _M_value.assign(1, __c);
     266             :         }
     267        2695 :       _M_at_bracket_start = false;
     268        2695 :     }
     269             : 
     270             :   // Differences between styles:
     271             :   // 1) "\}" in basic style.
     272             :   template<typename _CharT>
     273             :     void
     274         416 :     _Scanner<_CharT>::
     275             :     _M_scan_in_brace()
     276             :     {
     277         416 :       if (_M_current == _M_end)
     278           0 :         __throw_regex_error(
     279             :           regex_constants::error_brace,
     280             :           "Unexpected end of regex when in brace expression.");
     281             : 
     282         416 :       auto __c = *_M_current++;
     283             : 
     284         416 :       if (_M_ctype.is(_CtypeT::digit, __c))
     285             :         {
     286         208 :           _M_token = _S_token_dup_count;
     287         208 :           _M_value.assign(1, __c);
     288         208 :           while (_M_current != _M_end
     289         290 :                  && _M_ctype.is(_CtypeT::digit, *_M_current))
     290          82 :             _M_value += *_M_current++;
     291             :         }
     292         208 :       else if (__c == ',')
     293         104 :         _M_token = _S_token_comma;
     294             :       // basic use \}.
     295         104 :       else if (_M_is_basic())
     296             :         {
     297           0 :           if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
     298             :             {
     299           0 :               _M_state = _S_state_normal;
     300           0 :               _M_token = _S_token_interval_end;
     301           0 :               ++_M_current;
     302             :             }
     303             :           else
     304           0 :             __throw_regex_error(regex_constants::error_badbrace,
     305             :                                 "Unexpected character in brace expression.");
     306             :         }
     307         104 :       else if (__c == '}')
     308             :         {
     309         104 :           _M_state = _S_state_normal;
     310         104 :           _M_token = _S_token_interval_end;
     311             :         }
     312             :       else
     313           0 :         __throw_regex_error(regex_constants::error_badbrace,
     314             :                             "Unexpected character in brace expression.");
     315         416 :     }
     316             : 
     317             :   template<typename _CharT>
     318             :     void
     319         921 :     _Scanner<_CharT>::
     320             :     _M_eat_escape_ecma()
     321             :     {
     322         921 :       if (_M_current == _M_end)
     323           0 :         __throw_regex_error(regex_constants::error_escape,
     324             :                             "Unexpected end of regex when escaping.");
     325             : 
     326         921 :       auto __c = *_M_current++;
     327         921 :       auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
     328             : 
     329         921 :       if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
     330             :         {
     331           0 :           _M_token = _S_token_ord_char;
     332           0 :           _M_value.assign(1, *__pos);
     333             :         }
     334         921 :       else if (__c == 'b')
     335             :         {
     336           0 :           _M_token = _S_token_word_bound;
     337           0 :           _M_value.assign(1, 'p');
     338             :         }
     339         921 :       else if (__c == 'B')
     340             :         {
     341           0 :           _M_token = _S_token_word_bound;
     342           0 :           _M_value.assign(1, 'n');
     343             :         }
     344             :       // N3376 28.13
     345         921 :       else if (__c == 'd'
     346         880 :                || __c == 'D'
     347         880 :                || __c == 's'
     348         781 :                || __c == 'S'
     349         781 :                || __c == 'w'
     350         666 :                || __c == 'W')
     351             :         {
     352         255 :           _M_token = _S_token_quoted_class;
     353         255 :           _M_value.assign(1, __c);
     354             :         }
     355         666 :       else if (__c == 'c')
     356             :         {
     357           0 :           if (_M_current == _M_end)
     358           0 :             __throw_regex_error(
     359             :               regex_constants::error_escape,
     360             :               "Unexpected end of regex when reading control code.");
     361           0 :           _M_token = _S_token_ord_char;
     362           0 :           _M_value.assign(1, *_M_current++);
     363             :         }
     364         666 :       else if (__c == 'x' || __c == 'u')
     365             :         {
     366           0 :           _M_value.erase();
     367           0 :           for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++)
     368             :             {
     369           0 :               if (_M_current == _M_end
     370           0 :                   || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
     371           0 :                 __throw_regex_error(
     372             :                   regex_constants::error_escape,
     373             :                   "Unexpected end of regex when ascii character.");
     374           0 :               _M_value += *_M_current++;
     375             :             }
     376           0 :           _M_token = _S_token_hex_num;
     377           0 :         }
     378             :       // ECMAScript recognizes multi-digit back-references.
     379         666 :       else if (_M_ctype.is(_CtypeT::digit, __c))
     380             :         {
     381           0 :           _M_value.assign(1, __c);
     382           0 :           while (_M_current != _M_end
     383           0 :                  && _M_ctype.is(_CtypeT::digit, *_M_current))
     384           0 :             _M_value += *_M_current++;
     385           0 :           _M_token = _S_token_backref;
     386             :         }
     387             :       else
     388             :         {
     389         666 :           _M_token = _S_token_ord_char;
     390         666 :           _M_value.assign(1, __c);
     391             :         }
     392         921 :     }
     393             : 
     394             :   // Differences between styles:
     395             :   // 1) Extended doesn't support backref, but basic does.
     396             :   template<typename _CharT>
     397             :     void
     398           0 :     _Scanner<_CharT>::
     399             :     _M_eat_escape_posix()
     400             :     {
     401           0 :       if (_M_current == _M_end)
     402           0 :         __throw_regex_error(regex_constants::error_escape,
     403             :                             "Unexpected end of regex when escaping.");
     404             : 
     405           0 :       auto __c = *_M_current;
     406           0 :       auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
     407             : 
     408           0 :       if (__pos != nullptr && *__pos != '\0')
     409             :         {
     410           0 :           _M_token = _S_token_ord_char;
     411           0 :           _M_value.assign(1, __c);
     412             :         }
     413             :       // We MUST judge awk before handling backrefs. There's no backref in awk.
     414           0 :       else if (_M_is_awk())
     415             :         {
     416           0 :           _M_eat_escape_awk();
     417           0 :           return;
     418             :         }
     419           0 :       else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
     420             :         {
     421           0 :           _M_token = _S_token_backref;
     422           0 :           _M_value.assign(1, __c);
     423             :         }
     424             :       else
     425             :         {
     426             : #ifdef __STRICT_ANSI__
     427             :           // POSIX says it is undefined to escape ordinary characters
     428             :           __throw_regex_error(regex_constants::error_escape,
     429             :                               "Unexpected escape character.");
     430             : #else
     431           0 :           _M_token = _S_token_ord_char;
     432           0 :           _M_value.assign(1, __c);
     433             : #endif
     434             :         }
     435           0 :       ++_M_current;
     436             :     }
     437             : 
     438             :   template<typename _CharT>
     439             :     void
     440           0 :     _Scanner<_CharT>::
     441             :     _M_eat_escape_awk()
     442             :     {
     443           0 :       auto __c = *_M_current++;
     444           0 :       auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
     445             : 
     446           0 :       if (__pos != nullptr)
     447             :         {
     448           0 :           _M_token = _S_token_ord_char;
     449           0 :           _M_value.assign(1, *__pos);
     450             :         }
     451             :       // \ddd for oct representation
     452           0 :       else if (_M_ctype.is(_CtypeT::digit, __c)
     453           0 :                && __c != '8'
     454           0 :                && __c != '9')
     455             :         {
     456           0 :           _M_value.assign(1,  __c);
     457           0 :           for (int __i = 0;
     458             :                __i < 2
     459           0 :                && _M_current != _M_end
     460           0 :                && _M_ctype.is(_CtypeT::digit, *_M_current)
     461           0 :                && *_M_current != '8'
     462           0 :                && *_M_current != '9';
     463             :                __i++)
     464           0 :             _M_value += *_M_current++;
     465           0 :           _M_token = _S_token_oct_num;
     466           0 :           return;
     467             :         }
     468             :       else
     469           0 :         __throw_regex_error(regex_constants::error_escape,
     470             :                             "Unexpected escape character.");
     471             :     }
     472             : 
     473             :   // Eats a character class or throws an exception.
     474             :   // __ch could be ':', '.' or '=', _M_current is the char after ']' when
     475             :   // returning.
     476             :   template<typename _CharT>
     477             :     void
     478           0 :     _Scanner<_CharT>::
     479             :     _M_eat_class(char __ch)
     480             :     {
     481           0 :       for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
     482           0 :         _M_value += *_M_current++;
     483           0 :       if (_M_current == _M_end
     484           0 :           || *_M_current++ != __ch
     485           0 :           || _M_current == _M_end // skip __ch
     486           0 :           || *_M_current++ != ']') // skip ']'
     487             :         {
     488           0 :           if (__ch == ':')
     489           0 :             __throw_regex_error(regex_constants::error_ctype,
     490             :                                 "Unexpected end of character class.");
     491             :           else
     492           0 :             __throw_regex_error(regex_constants::error_collate,
     493             :                                 "Unexpected end of character class.");
     494             :         }
     495           0 :     }
     496             : 
     497             : #ifdef _GLIBCXX_DEBUG
     498             :   template<typename _CharT>
     499             :     std::ostream&
     500             :     _Scanner<_CharT>::
     501             :     _M_print(std::ostream& ostr)
     502             :     {
     503             :       switch (_M_token)
     504             :       {
     505             :       case _S_token_anychar:
     506             :         ostr << "any-character\n";
     507             :         break;
     508             :       case _S_token_backref:
     509             :         ostr << "backref\n";
     510             :         break;
     511             :       case _S_token_bracket_begin:
     512             :         ostr << "bracket-begin\n";
     513             :         break;
     514             :       case _S_token_bracket_neg_begin:
     515             :         ostr << "bracket-neg-begin\n";
     516             :         break;
     517             :       case _S_token_bracket_end:
     518             :         ostr << "bracket-end\n";
     519             :         break;
     520             :       case _S_token_char_class_name:
     521             :         ostr << "char-class-name \"" << _M_value << "\"\n";
     522             :         break;
     523             :       case _S_token_closure0:
     524             :         ostr << "closure0\n";
     525             :         break;
     526             :       case _S_token_closure1:
     527             :         ostr << "closure1\n";
     528             :         break;
     529             :       case _S_token_collsymbol:
     530             :         ostr << "collsymbol \"" << _M_value << "\"\n";
     531             :         break;
     532             :       case _S_token_comma:
     533             :         ostr << "comma\n";
     534             :         break;
     535             :       case _S_token_dup_count:
     536             :         ostr << "dup count: " << _M_value << "\n";
     537             :         break;
     538             :       case _S_token_eof:
     539             :         ostr << "EOF\n";
     540             :         break;
     541             :       case _S_token_equiv_class_name:
     542             :         ostr << "equiv-class-name \"" << _M_value << "\"\n";
     543             :         break;
     544             :       case _S_token_interval_begin:
     545             :         ostr << "interval begin\n";
     546             :         break;
     547             :       case _S_token_interval_end:
     548             :         ostr << "interval end\n";
     549             :         break;
     550             :       case _S_token_line_begin:
     551             :         ostr << "line begin\n";
     552             :         break;
     553             :       case _S_token_line_end:
     554             :         ostr << "line end\n";
     555             :         break;
     556             :       case _S_token_opt:
     557             :         ostr << "opt\n";
     558             :         break;
     559             :       case _S_token_or:
     560             :         ostr << "or\n";
     561             :         break;
     562             :       case _S_token_ord_char:
     563             :         ostr << "ordinary character: \"" << _M_value << "\"\n";
     564             :         break;
     565             :       case _S_token_subexpr_begin:
     566             :         ostr << "subexpr begin\n";
     567             :         break;
     568             :       case _S_token_subexpr_no_group_begin:
     569             :         ostr << "no grouping subexpr begin\n";
     570             :         break;
     571             :       case _S_token_subexpr_lookahead_begin:
     572             :         ostr << "lookahead subexpr begin\n";
     573             :         break;
     574             :       case _S_token_subexpr_end:
     575             :         ostr << "subexpr end\n";
     576             :         break;
     577             :       case _S_token_unknown:
     578             :         ostr << "-- unknown token --\n";
     579             :         break;
     580             :       case _S_token_oct_num:
     581             :         ostr << "oct number " << _M_value << "\n";
     582             :         break;
     583             :       case _S_token_hex_num:
     584             :         ostr << "hex number " << _M_value << "\n";
     585             :         break;
     586             :       case _S_token_quoted_class:
     587             :         ostr << "quoted class " << "\\" << _M_value << "\n";
     588             :         break;
     589             :       default:
     590             :         _GLIBCXX_DEBUG_ASSERT(false);
     591             :       }
     592             :       return ostr;
     593             :     }
     594             : #endif
     595             : 
     596             : } // namespace __detail
     597             : _GLIBCXX_END_NAMESPACE_VERSION
     598             : } // namespace

Generated by: LCOV version 1.14