Line data Source code
1 : // class template regex -*- C++ -*-
2 :
3 : // Copyright (C) 2013-2021 Free Software Foundation, Inc.
4 : //
5 : // This file is part of the GNU ISO C++ Library. This library is free
6 : // software; you can redistribute it and/or modify it under the
7 : // terms of the GNU General Public License as published by the
8 : // Free Software Foundation; either version 3, or (at your option)
9 : // any later version.
10 :
11 : // This library is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 :
16 : // Under Section 7 of GPL version 3, you are granted additional
17 : // permissions described in the GCC Runtime Library Exception, version
18 : // 3.1, as published by the Free Software Foundation.
19 :
20 : // You should have received a copy of the GNU General Public License and
21 : // a copy of the GCC Runtime Library Exception along with this program;
22 : // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 : // <http://www.gnu.org/licenses/>.
24 :
25 : /**
26 : * @file bits/regex_scanner.h
27 : * This is an internal header file, included by other library headers.
28 : * Do not attempt to use it directly. @headername{regex}
29 : */
30 :
31 : namespace std _GLIBCXX_VISIBILITY(default)
32 : {
33 : _GLIBCXX_BEGIN_NAMESPACE_VERSION
34 :
35 : namespace __detail
36 : {
37 : /**
38 : * @addtogroup regex-detail
39 : * @{
40 : */
41 :
// Non-template part of the regex scanner: the token and state enums, the
// grammar-flag predicates, and the escape / special-character tables.
// _Scanner<_CharT> derives from it publicly.
// NOTE(review): _M_escape_tbl is set to point at one of this object's own
// array members and no copy operations are declared in this listing, so a
// memberwise copy would keep pointing into the source object - confirm that
// scanner objects are never copied.
42 : struct _ScannerBase
43 : {
44 : public:
45 : /// Token types returned from the scanner.
// The underlying type is unsigned so that _S_token_unknown can be written
// as -1u, i.e. the largest unsigned value; the other enumerators count up
// from zero.
46 : enum _TokenT : unsigned
47 : {
48 : _S_token_anychar,
49 : _S_token_ord_char,
50 : _S_token_oct_num,
51 : _S_token_hex_num,
52 : _S_token_backref,
53 : _S_token_subexpr_begin,
54 : _S_token_subexpr_no_group_begin,
55 : _S_token_subexpr_lookahead_begin, // neg if _M_value[0] == 'n'
56 : _S_token_subexpr_end,
57 : _S_token_bracket_begin,
58 : _S_token_bracket_neg_begin,
59 : _S_token_bracket_end,
60 : _S_token_interval_begin,
61 : _S_token_interval_end,
62 : _S_token_quoted_class,
63 : _S_token_char_class_name,
64 : _S_token_collsymbol,
65 : _S_token_equiv_class_name,
66 : _S_token_opt,
67 : _S_token_or,
68 : _S_token_closure0,
69 : _S_token_closure1,
70 : _S_token_line_begin,
71 : _S_token_line_end,
72 : _S_token_word_bound, // neg if _M_value[0] == 'n'
73 : _S_token_comma,
74 : _S_token_dup_count,
75 : _S_token_eof,
76 : _S_token_bracket_dash,
77 : _S_token_unknown = -1u
78 : };
79 :
80 : protected:
81 : typedef regex_constants::syntax_option_type _FlagT;
82 :
// Scanner mode. The constructor starts in _S_state_normal.
// NOTE(review): presumably each state maps to one of the _M_scan_* members
// of _Scanner (normal, in_brace, in_bracket); the dispatch is not visible
// in this header - confirm in the out-of-line definitions.
83 : enum _StateT
84 : {
85 : _S_state_normal,
86 : _S_state_in_brace,
87 : _S_state_in_bracket,
88 : };
89 :
90 : protected:
// Builds the base for the grammar selected by __flags.
//  - _M_state starts as _S_state_normal, _M_at_bracket_start as false.
//  - _M_escape_tbl is the ECMAScript table when the ECMAScript flag is set
//    and the awk table for every other grammar.
//  - _M_spec_char is chosen by testing the flags in this order: ECMAScript,
//    basic, extended, grep, egrep, awk. Operator & binds tighter than ?:,
//    so every step reads as (flags & option) ? set : next-test.
//    The grep and egrep literals are the basic and extended sets with a
//    newline appended. When no option matches the result is nullptr, which
//    the __glibcxx_assert in the body is there to catch.
// _M_flags is declared before _M_escape_tbl and _M_spec_char, so the
// _M_is_ecma() calls in their initializers read an initialized value.
// _M_token is not given a value by this constructor.
91 368 : _ScannerBase(_FlagT __flags)
92 368 : : _M_state(_S_state_normal),
93 368 : _M_flags(__flags),
94 736 : _M_escape_tbl(_M_is_ecma()
95 368 : ? _M_ecma_escape_tbl
96 : : _M_awk_escape_tbl),
97 368 : _M_spec_char(_M_is_ecma()
98 368 : ? _M_ecma_spec_char
99 0 : : _M_flags & regex_constants::basic
100 0 : ? _M_basic_spec_char
101 0 : : _M_flags & regex_constants::extended
102 0 : ? _M_extended_spec_char
103 0 : : _M_flags & regex_constants::grep
104 0 : ? ".[\\*^$\n"
105 0 : : _M_flags & regex_constants::egrep
106 0 : ? ".[\\()*+?{|^$\n"
107 0 : : _M_flags & regex_constants::awk
108 0 : ? _M_extended_spec_char
109 : : nullptr),
110 736 : _M_at_bracket_start(false)
111 368 : { __glibcxx_assert(_M_spec_char); }
112 :
113 : protected:
// Looks __c up in the escape table selected by the constructor.
// Returns a pointer to the replacement character stored in the matching
// entry, or nullptr when the table has no entry for __c. The scan stops at
// the entry whose key is NUL, so a NUL argument never matches.
114 : const char*
115 921 : _M_find_escape(char __c)
116 : {
117 921 : auto __it = _M_escape_tbl;
118 7368 : for (; __it->first != '\0'; ++__it)
119 6447 : if (__it->first == __c)
120 0 : return &__it->second;
121 921 : return nullptr;
122 : }
123 :
// Grammar predicates. Each one tests _M_flags against one or more
// syntax_option_type bits and converts the result to bool.
// True when the ECMAScript option is set.
124 : bool
125 2671 : _M_is_ecma() const
126 2671 : { return _M_flags & regex_constants::ECMAScript; }
127 :
// True when either basic or grep is set.
128 : bool
129 686 : _M_is_basic() const
130 686 : { return _M_flags & (regex_constants::basic | regex_constants::grep); }
131 :
// True when any of extended, egrep or awk is set.
132 : bool
133 : _M_is_extended() const
134 : {
135 : return _M_flags & (regex_constants::extended
136 : | regex_constants::egrep
137 : | regex_constants::awk);
138 : }
139 :
// True when either grep or egrep is set.
140 : bool
141 : _M_is_grep() const
142 : { return _M_flags & (regex_constants::grep | regex_constants::egrep); }
143 :
// True when the awk option is set.
144 : bool
145 0 : _M_is_awk() const
146 0 : { return _M_flags & regex_constants::awk; }
147 :
148 : protected:
// The tables below are non-static data members with default member
// initializers, so every scanner object carries its own copy; the TODO
// records that this can only change together with the ABI.
149 : // TODO: Make them static in the next abi change.
// Maps a single pattern character to the token it produces.
// NOTE(review): the last entry (NUL key) looks like a terminator, by
// analogy with the escape tables; the code that reads this table is not
// in this header - confirm.
150 : const std::pair<char, _TokenT> _M_token_tbl[9] =
151 : {
152 : {'^', _S_token_line_begin},
153 : {'$', _S_token_line_end},
154 : {'.', _S_token_anychar},
155 : {'*', _S_token_closure0},
156 : {'+', _S_token_closure1},
157 : {'?', _S_token_opt},
158 : {'|', _S_token_or},
159 : {'\n', _S_token_or}, // grep and egrep
160 : {'\0', _S_token_or},
161 : };
// Escape letter -> replacement character, used when the ECMAScript option
// is set. The final NUL-keyed entry is the terminator _M_find_escape
// stops on.
162 : const std::pair<char, char> _M_ecma_escape_tbl[8] =
163 : {
164 : {'0', '\0'},
165 : {'b', '\b'},
166 : {'f', '\f'},
167 : {'n', '\n'},
168 : {'r', '\r'},
169 : {'t', '\t'},
170 : {'v', '\v'},
171 : {'\0', '\0'},
172 : };
// Escape letter -> replacement character, selected by the constructor for
// every grammar other than ECMAScript. Terminated the same way.
173 : const std::pair<char, char> _M_awk_escape_tbl[11] =
174 : {
175 : {'"', '"'},
176 : {'/', '/'},
177 : {'\\', '\\'},
178 : {'a', '\a'},
179 : {'b', '\b'},
180 : {'f', '\f'},
181 : {'n', '\n'},
182 : {'r', '\r'},
183 : {'t', '\t'},
184 : {'v', '\v'},
185 : {'\0', '\0'},
186 : };
// Candidate character sets for _M_spec_char; the constructor picks one of
// these (or an inline literal for grep and egrep) according to the flags.
187 : const char* _M_ecma_spec_char = "^$\\.*+?()[]{}|";
188 : const char* _M_basic_spec_char = ".[\\*^$";
189 : const char* _M_extended_spec_char = ".[\\()*+?{|^$";
190 :
// Mutable scanner state. Declaration order matters: _M_flags has to come
// before _M_escape_tbl and _M_spec_char because their initializers call
// _M_is_ecma().
191 : _StateT _M_state;
192 : _FlagT _M_flags;
// Not initialized by the constructor above; returned by
// _Scanner::_M_get_token().
193 : _TokenT _M_token;
// Points at either _M_ecma_escape_tbl or _M_awk_escape_tbl of this object.
194 : const std::pair<char, char>* _M_escape_tbl;
// Never null after construction when __glibcxx_assert is active.
195 : const char* _M_spec_char;
196 : bool _M_at_bracket_start;
197 : };
198 :
199 : /**
200 : * @brief Scans an input range for regex tokens.
201 : *
202 : * The %_Scanner class interprets the regular expression pattern in
203 : * the input range passed to its constructor as a sequence of parse
204 : * tokens passed to the regular expression compiler. The sequence
205 : * of tokens provided depends on the flag settings passed to the
206 : * constructor: different regular expression grammars will interpret
207 : * the same input pattern in syntactically different ways.
208 : */
// Character-type-specific scanner built on _ScannerBase. Only the two
// accessors are defined inline; every other member function is declared
// here and defined out of line.
// NOTE(review): the definitions are presumably in bits/regex_scanner.tcc,
// which this header includes at the end - confirm.
209 : template<typename _CharT>
210 : class _Scanner
211 : : public _ScannerBase
212 : {
213 : public:
214 : typedef std::basic_string<_CharT> _StringT;
// Same type as the protected _ScannerBase::_FlagT, re-declared here so
// that it is publicly accessible.
215 : typedef regex_constants::syntax_option_type _FlagT;
// Note the const: _M_ctype below is a reference to a const facet.
216 : typedef const std::ctype<_CharT> _CtypeT;
217 :
// Takes the pattern as the pointer range [__begin, __end), the grammar
// flags and a locale (by value).
// NOTE(review): presumably _M_ctype is obtained from __loc and the first
// token is scanned during construction - the definition is not visible
// here, confirm.
218 : _Scanner(const _CharT* __begin, const _CharT* __end,
219 : _FlagT __flags, std::locale __loc);
220 :
// Declared only. By its name it moves on to the next token; the result
// is read back through _M_get_token() and _M_get_value().
221 : void
222 : _M_advance();
223 :
// Returns the current value of the inherited _M_token member.
224 : _TokenT
225 117294 : _M_get_token() const noexcept
226 117294 : { return _M_token; }
227 :
// Returns _M_value by const reference; the reference stays tied to this
// scanner object, so copy the string if it must outlive later scanning.
228 : const _StringT&
229 11613 : _M_get_value() const noexcept
230 11613 : { return _M_value; }
231 :
// Debug printing helper, declared only when _GLIBCXX_DEBUG is defined.
232 : #ifdef _GLIBCXX_DEBUG
233 : std::ostream&
234 : _M_print(std::ostream&);
235 : #endif
236 :
237 : private:
// Scanning routines whose names match the three _StateT values
// (normal, in_bracket, in_brace).
238 : void
239 : _M_scan_normal();
240 :
241 : void
242 : _M_scan_in_bracket();
243 :
244 : void
245 : _M_scan_in_brace();
246 :
// Escape handlers, one per grammar family. Their signature matches the
// _M_eat_escape pointer-to-member declared below.
247 : void
248 : _M_eat_escape_ecma();
249 :
250 : void
251 : _M_eat_escape_posix();
252 :
253 : void
254 : _M_eat_escape_awk();
255 :
// NOTE(review): the meaning of the char argument is not visible in this
// header - confirm against the definition.
256 : void
257 : _M_eat_class(char);
258 :
// NOTE(review): presumably the read position and the end of the pattern
// range given to the constructor - confirm.
259 : const _CharT* _M_current;
260 : const _CharT* _M_end;
// Reference member: a _Scanner cannot be default copy-assigned.
261 : _CtypeT& _M_ctype;
// Text returned by _M_get_value().
262 : _StringT _M_value;
// Pointer to a member function taking no arguments and returning void.
// NOTE(review): presumably set to one of the _M_eat_escape_* members
// according to the grammar flags - confirm in the constructor definition.
263 : void (_Scanner::* _M_eat_escape)();
264 : };
265 :
266 : ///@} regex-detail
267 : } // namespace __detail
268 : _GLIBCXX_END_NAMESPACE_VERSION
269 : } // namespace std
270 :
271 : #include <bits/regex_scanner.tcc>
|