Line data Source code
1 : // class template regex -*- C++ -*-
2 :
3 : // Copyright (C) 2013-2021 Free Software Foundation, Inc.
4 : //
5 : // This file is part of the GNU ISO C++ Library. This library is free
6 : // software; you can redistribute it and/or modify it under the
7 : // terms of the GNU General Public License as published by the
8 : // Free Software Foundation; either version 3, or (at your option)
9 : // any later version.
10 :
11 : // This library is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 :
16 : // Under Section 7 of GPL version 3, you are granted additional
17 : // permissions described in the GCC Runtime Library Exception, version
18 : // 3.1, as published by the Free Software Foundation.
19 :
20 : // You should have received a copy of the GNU General Public License and
21 : // a copy of the GCC Runtime Library Exception along with this program;
22 : // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 : // <http://www.gnu.org/licenses/>.
24 :
25 : /**
26 : * @file bits/regex_automaton.h
27 : * This is an internal header file, included by other library headers.
28 : * Do not attempt to use it directly. @headername{regex}
29 : */
30 :
31 : // This macro defines the maximal state number a NFA can have.
32 : #ifndef _GLIBCXX_REGEX_STATE_LIMIT
33 : #define _GLIBCXX_REGEX_STATE_LIMIT 100000
34 : #endif
35 :
36 : namespace std _GLIBCXX_VISIBILITY(default)
37 : {
38 : _GLIBCXX_BEGIN_NAMESPACE_VERSION
39 :
40 : namespace __detail
41 : {
42 : /**
43 : * @defgroup regex-detail Base and Implementation Classes
44 : * @ingroup regex
45 : * @{
46 : */
47 :
48 : typedef long _StateIdT;
49 : static const _StateIdT _S_invalid_state_id = -1;
50 :
51 : template<typename _CharT>
52 : using _Matcher = std::function<bool (_CharT)>;
53 :
54 : /// Operation codes that define the type of transitions within the base NFA
55 : /// that represents the regular expression.
56 : enum _Opcode : int
57 : {
58 : _S_opcode_unknown,
59 : _S_opcode_alternative,
60 : _S_opcode_repeat,
61 : _S_opcode_backref,
62 : _S_opcode_line_begin_assertion,
63 : _S_opcode_line_end_assertion,
64 : _S_opcode_word_boundary,
65 : _S_opcode_subexpr_lookahead,
66 : _S_opcode_subexpr_begin,
67 : _S_opcode_subexpr_end,
68 : _S_opcode_dummy,
69 : _S_opcode_match,
70 : _S_opcode_accept,
71 : };
72 :
73 : struct _State_base
74 : {
75 : protected:
76 : _Opcode _M_opcode; // type of outgoing transition
77 :
78 : public:
79 : _StateIdT _M_next; // outgoing transition
80 : union // Since they are mutually exclusive.
81 : {
82 : size_t _M_subexpr; // for _S_opcode_subexpr_*
83 : size_t _M_backref_index; // for _S_opcode_backref
84 : struct
85 : {
86 : // for _S_opcode_alternative, _S_opcode_repeat and
87 : // _S_opcode_subexpr_lookahead
88 : _StateIdT _M_alt;
89 : // for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or
90 : // quantifiers (ungreedy if set true)
91 : bool _M_neg;
92 : };
93 : // For _S_opcode_match
94 : __gnu_cxx::__aligned_membuf<_Matcher<char>> _M_matcher_storage;
95 : };
96 :
97 : protected:
98 16646 : explicit _State_base(_Opcode __opcode) noexcept
99 16646 : : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
100 16646 : { }
101 :
102 : public:
103 : bool
104 32786 : _M_has_alt() const noexcept
105 : {
106 32786 : return _M_opcode == _S_opcode_alternative
107 32458 : || _M_opcode == _S_opcode_repeat
108 65244 : || _M_opcode == _S_opcode_subexpr_lookahead;
109 : }
110 :
111 : #ifdef _GLIBCXX_DEBUG
112 : std::ostream&
113 : _M_print(std::ostream& ostr) const;
114 :
115 : // Prints graphviz dot commands for state.
116 : std::ostream&
117 : _M_dot(std::ostream& __ostr, _StateIdT __id) const;
118 : #endif
119 : };
120 :
121 : template<typename _Char_type>
122 : struct _State : _State_base
123 : {
124 : typedef _Matcher<_Char_type> _MatcherT;
125 : static_assert(sizeof(_MatcherT) == sizeof(_Matcher<char>),
126 : "std::function<bool(T)> has the same size as "
127 : "std::function<bool(char)>");
128 : static_assert(alignof(_MatcherT) == alignof(_Matcher<char>),
129 : "std::function<bool(T)> has the same alignment as "
130 : "std::function<bool(char)>");
131 :
132 : explicit
133 16646 : _State(_Opcode __opcode) noexcept : _State_base(__opcode)
134 : {
135 16646 : if (_M_opcode() == _S_opcode_match)
136 4883 : new (this->_M_matcher_storage._M_addr()) _MatcherT();
137 16646 : }
138 :
139 5380 : _State(const _State& __rhs) : _State_base(__rhs)
140 : {
141 5380 : if (__rhs._M_opcode() == _S_opcode_match)
142 5380 : new (this->_M_matcher_storage._M_addr())
143 5380 : _MatcherT(__rhs._M_get_matcher());
144 5380 : }
145 :
146 75118 : _State(_State&& __rhs) noexcept : _State_base(__rhs)
147 : {
148 75118 : if (__rhs._M_opcode() == _S_opcode_match)
149 37477 : new (this->_M_matcher_storage._M_addr())
150 37477 : _MatcherT(std::move(__rhs._M_get_matcher()));
151 75118 : }
152 :
153 : _State&
154 : operator=(const _State&) = delete;
155 :
156 97144 : ~_State()
157 : {
158 97144 : if (_M_opcode() == _S_opcode_match)
159 47740 : _M_get_matcher().~_MatcherT();
160 97144 : }
161 :
162 : // Since correct ctor and dtor rely on _M_opcode, it's better not to
163 : // change it over time.
164 : _Opcode
165 2942876 : _M_opcode() const noexcept
166 2942876 : { return _State_base::_M_opcode; }
167 :
168 : bool
169 1350760 : _M_matches(_Char_type __char) const
170 1350760 : { return _M_get_matcher()(__char); }
171 :
172 : _MatcherT&
173 90100 : _M_get_matcher() noexcept
174 90100 : { return *static_cast<_MatcherT*>(this->_M_matcher_storage._M_addr()); }
175 :
176 : const _MatcherT&
177 1356140 : _M_get_matcher() const noexcept
178 : {
179 : return *static_cast<const _MatcherT*>(
180 1356140 : this->_M_matcher_storage._M_addr());
181 : }
182 : };
183 :
184 : struct _NFA_base
185 : {
186 : typedef regex_constants::syntax_option_type _FlagT;
187 :
188 : explicit
189 368 : _NFA_base(_FlagT __f) noexcept
190 736 : : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
191 368 : _M_has_backref(false)
192 368 : { }
193 :
194 : _NFA_base(_NFA_base&&) = default;
195 :
196 : protected:
197 368 : ~_NFA_base() = default;
198 :
199 : public:
200 : _FlagT
201 717 : _M_options() const noexcept
202 717 : { return _M_flags; }
203 :
204 : _StateIdT
205 23504 : _M_start() const noexcept
206 23504 : { return _M_start_state; }
207 :
208 : size_t
209 23136 : _M_sub_count() const noexcept
210 23136 : { return _M_subexpr_count; }
211 :
212 : _GLIBCXX_STD_C::vector<size_t> _M_paren_stack;
213 : _FlagT _M_flags;
214 : _StateIdT _M_start_state;
215 : size_t _M_subexpr_count;
216 : bool _M_has_backref;
217 : };
218 :
219 : template<typename _TraitsT>
220 : struct _NFA
221 : : _NFA_base, _GLIBCXX_STD_C::vector<_State<typename _TraitsT::char_type>>
222 : {
223 : typedef typename _TraitsT::char_type _Char_type;
224 : typedef _State<_Char_type> _StateT;
225 : typedef _Matcher<_Char_type> _MatcherT;
226 :
227 368 : _NFA(const typename _TraitsT::locale_type& __loc, _FlagT __flags)
228 368 : : _NFA_base(__flags)
229 368 : { _M_traits.imbue(__loc); }
230 :
231 : // for performance reasons _NFA objects should only be moved not copied
232 : _NFA(const _NFA&) = delete;
233 : _NFA(_NFA&&) = default;
234 :
235 : _StateIdT
236 368 : _M_insert_accept()
237 : {
238 368 : auto __ret = _M_insert_state(_StateT(_S_opcode_accept));
239 368 : return __ret;
240 : }
241 :
242 : _StateIdT
243 328 : _M_insert_alt(_StateIdT __next, _StateIdT __alt,
244 : bool __neg __attribute__((__unused__)))
245 : {
246 328 : _StateT __tmp(_S_opcode_alternative);
247 : // It labels every quantifier to make greedy comparison easier in BFS
248 : // approach.
249 328 : __tmp._M_next = __next;
250 328 : __tmp._M_alt = __alt;
251 656 : return _M_insert_state(std::move(__tmp));
252 328 : }
253 :
254 : _StateIdT
255 6255 : _M_insert_repeat(_StateIdT __next, _StateIdT __alt, bool __neg)
256 : {
257 6255 : _StateT __tmp(_S_opcode_repeat);
258 : // It labels every quantifier to make greedy comparison easier in BFS
259 : // approach.
260 6255 : __tmp._M_next = __next;
261 6255 : __tmp._M_alt = __alt;
262 6255 : __tmp._M_neg = __neg;
263 12510 : return _M_insert_state(std::move(__tmp));
264 6255 : }
265 :
266 : _StateIdT
267 4883 : _M_insert_matcher(_MatcherT __m)
268 : {
269 4883 : _StateT __tmp(_S_opcode_match);
270 4883 : __tmp._M_get_matcher() = std::move(__m);
271 9766 : return _M_insert_state(std::move(__tmp));
272 4883 : }
273 :
274 : _StateIdT
275 1055 : _M_insert_subexpr_begin()
276 : {
277 1055 : auto __id = this->_M_subexpr_count++;
278 1055 : this->_M_paren_stack.push_back(__id);
279 1055 : _StateT __tmp(_S_opcode_subexpr_begin);
280 1055 : __tmp._M_subexpr = __id;
281 2110 : return _M_insert_state(std::move(__tmp));
282 1055 : }
283 :
284 : _StateIdT
285 1055 : _M_insert_subexpr_end()
286 : {
287 1055 : _StateT __tmp(_S_opcode_subexpr_end);
288 1055 : __tmp._M_subexpr = this->_M_paren_stack.back();
289 1055 : this->_M_paren_stack.pop_back();
290 2110 : return _M_insert_state(std::move(__tmp));
291 1055 : }
292 :
293 : _StateIdT
294 : _M_insert_backref(size_t __index);
295 :
296 : _StateIdT
297 96 : _M_insert_line_begin()
298 96 : { return _M_insert_state(_StateT(_S_opcode_line_begin_assertion)); }
299 :
300 : _StateIdT
301 85 : _M_insert_line_end()
302 85 : { return _M_insert_state(_StateT(_S_opcode_line_end_assertion)); }
303 :
304 : _StateIdT
305 0 : _M_insert_word_bound(bool __neg)
306 : {
307 0 : _StateT __tmp(_S_opcode_word_boundary);
308 0 : __tmp._M_neg = __neg;
309 0 : return _M_insert_state(std::move(__tmp));
310 0 : }
311 :
312 : _StateIdT
313 0 : _M_insert_lookahead(_StateIdT __alt, bool __neg)
314 : {
315 0 : _StateT __tmp(_S_opcode_subexpr_lookahead);
316 0 : __tmp._M_alt = __alt;
317 0 : __tmp._M_neg = __neg;
318 0 : return _M_insert_state(std::move(__tmp));
319 0 : }
320 :
321 : _StateIdT
322 2521 : _M_insert_dummy()
323 2521 : { return _M_insert_state(_StateT(_S_opcode_dummy)); }
324 :
325 : _StateIdT
326 22026 : _M_insert_state(_StateT __s)
327 : {
328 22026 : this->push_back(std::move(__s));
329 22026 : if (this->size() > _GLIBCXX_REGEX_STATE_LIMIT)
330 0 : __throw_regex_error(
331 : regex_constants::error_space,
332 : "Number of NFA states exceeds limit. Please use shorter regex "
333 : "string, or use smaller brace expression, or make "
334 : "_GLIBCXX_REGEX_STATE_LIMIT larger.");
335 22026 : return this->size() - 1;
336 : }
337 :
338 : // Eliminate dummy node in this NFA to make it compact.
339 : void
340 : _M_eliminate_dummy();
341 :
342 : #ifdef _GLIBCXX_DEBUG
343 : std::ostream&
344 : _M_dot(std::ostream& __ostr) const;
345 : #endif
346 : public:
347 : _TraitsT _M_traits;
348 : };
349 :
350 : /// Describes a sequence of one or more %_State, its current start
351 : /// and end(s). This structure contains fragments of an NFA during
352 : /// construction.
353 : template<typename _TraitsT>
354 : class _StateSeq
355 : {
356 : public:
357 : typedef _NFA<_TraitsT> _RegexT;
358 :
359 : public:
360 8319 : _StateSeq(_RegexT& __nfa, _StateIdT __s)
361 8319 : : _M_nfa(__nfa), _M_start(__s), _M_end(__s)
362 8319 : { }
363 :
364 10984 : _StateSeq(_RegexT& __nfa, _StateIdT __s, _StateIdT __end)
365 10984 : : _M_nfa(__nfa), _M_start(__s), _M_end(__end)
366 10984 : { }
367 :
368 : // Append a state on *this and change *this to the new sequence.
369 : void
370 3739 : _M_append(_StateIdT __id)
371 : {
372 3739 : _M_nfa[_M_end]._M_next = __id;
373 3739 : _M_end = __id;
374 3739 : }
375 :
376 : // Append a sequence on *this and change *this to the new sequence.
377 : void
378 12579 : _M_append(const _StateSeq& __s)
379 : {
380 12579 : _M_nfa[_M_end]._M_next = __s._M_start;
381 12579 : _M_end = __s._M_end;
382 12579 : }
383 :
384 : // Clones an entire sequence.
385 : _StateSeq
386 : _M_clone();
387 :
388 : public:
389 : _RegexT& _M_nfa;
390 : _StateIdT _M_start;
391 : _StateIdT _M_end;
392 : };
393 :
394 : ///@} regex-detail
395 : } // namespace __detail
396 :
397 : _GLIBCXX_END_NAMESPACE_VERSION
398 : } // namespace std
399 :
400 : #include <bits/regex_automaton.tcc>
|