libstdc++
regex_compiler.tcc
Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2013-2016 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex_compiler.tcc
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 // FIXME make comments doxygen format.
00032 
// This compiler refers to "Regular Expression Matching Can Be Simple And Fast"
// (http://swtch.com/~rsc/regexp/regexp1.html),
// but doesn't strictly follow it.
00036 //
// When compiling, states are *chained* instead of tree- or graph-constructed.
// It's more like a structured program: there are "if" statements and "loop"
// statements.
00039 //
00040 // For alternative structure (say "a|b"), aka "if statement", two branches
00041 // should be constructed. However, these two shall merge to an "end_tag" at
00042 // the end of this operator:
00043 //
00044 //                branch1
00045 //              /        \
00046 // => begin_tag            end_tag =>
00047 //              \        /
00048 //                branch2
00049 //
00050 // This is the difference between this implementation and that in Russ's
00051 // article.
00052 //
// That's why we introduce dummy nodes here: "end_tag" is a dummy node.
// All dummy nodes are eliminated at the end of the compiling process.
00055 
00056 namespace std _GLIBCXX_VISIBILITY(default)
00057 {
00058 namespace __detail
00059 {
00060 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00061 
00062   template<typename _TraitsT>
00063     _Compiler<_TraitsT>::
00064     _Compiler(_IterT __b, _IterT __e,
00065               const typename _TraitsT::locale_type& __loc, _FlagT __flags)
00066     : _M_flags((__flags
00067                 & (regex_constants::ECMAScript
00068                    | regex_constants::basic
00069                    | regex_constants::extended
00070                    | regex_constants::grep
00071                    | regex_constants::egrep
00072                    | regex_constants::awk))
00073                ? __flags
00074                : __flags | regex_constants::ECMAScript),
00075       _M_scanner(__b, __e, _M_flags, __loc),
00076       _M_nfa(make_shared<_RegexT>(__loc, _M_flags)),
00077       _M_traits(_M_nfa->_M_traits),
00078       _M_ctype(std::use_facet<_CtypeT>(__loc))
00079     {
00080       _StateSeqT __r(*_M_nfa, _M_nfa->_M_start());
00081       __r._M_append(_M_nfa->_M_insert_subexpr_begin());
00082       this->_M_disjunction();
00083       if (!_M_match_token(_ScannerT::_S_token_eof))
00084         __throw_regex_error(regex_constants::error_paren);
00085       __r._M_append(_M_pop());
00086       __glibcxx_assert(_M_stack.empty());
00087       __r._M_append(_M_nfa->_M_insert_subexpr_end());
00088       __r._M_append(_M_nfa->_M_insert_accept());
00089       _M_nfa->_M_eliminate_dummy();
00090     }
00091 
00092   template<typename _TraitsT>
00093     void
00094     _Compiler<_TraitsT>::
00095     _M_disjunction()
00096     {
00097       this->_M_alternative();
00098       while (_M_match_token(_ScannerT::_S_token_or))
00099         {
00100           _StateSeqT __alt1 = _M_pop();
00101           this->_M_alternative();
00102           _StateSeqT __alt2 = _M_pop();
00103           auto __end = _M_nfa->_M_insert_dummy();
00104           __alt1._M_append(__end);
00105           __alt2._M_append(__end);
00106           // __alt2 is state._M_next, __alt1 is state._M_alt. The executor
00107           // executes _M_alt before _M_next, as well as executing left
00108           // alternative before right one.
00109           _M_stack.push(_StateSeqT(*_M_nfa,
00110                                    _M_nfa->_M_insert_alt(
00111                                      __alt2._M_start, __alt1._M_start, false),
00112                                    __end));
00113         }
00114     }
00115 
00116   template<typename _TraitsT>
00117     void
00118     _Compiler<_TraitsT>::
00119     _M_alternative()
00120     {
00121       if (this->_M_term())
00122         {
00123           _StateSeqT __re = _M_pop();
00124           this->_M_alternative();
00125           __re._M_append(_M_pop());
00126           _M_stack.push(__re);
00127         }
00128       else
00129         _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_dummy()));
00130     }
00131 
00132   template<typename _TraitsT>
00133     bool
00134     _Compiler<_TraitsT>::
00135     _M_term()
00136     {
00137       if (this->_M_assertion())
00138         return true;
00139       if (this->_M_atom())
00140         {
00141           while (this->_M_quantifier());
00142           return true;
00143         }
00144       return false;
00145     }
00146 
00147   template<typename _TraitsT>
00148     bool
00149     _Compiler<_TraitsT>::
00150     _M_assertion()
00151     {
00152       if (_M_match_token(_ScannerT::_S_token_line_begin))
00153         _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_begin()));
00154       else if (_M_match_token(_ScannerT::_S_token_line_end))
00155         _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_end()));
00156       else if (_M_match_token(_ScannerT::_S_token_word_bound))
00157         // _M_value[0] == 'n' means it's negative, say "not word boundary".
00158         _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
00159               _M_insert_word_bound(_M_value[0] == 'n')));
00160       else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin))
00161         {
00162           auto __neg = _M_value[0] == 'n';
00163           this->_M_disjunction();
00164           if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
00165             __throw_regex_error(regex_constants::error_paren,
00166                                 "Parenthesis is not closed.");
00167           auto __tmp = _M_pop();
00168           __tmp._M_append(_M_nfa->_M_insert_accept());
00169           _M_stack.push(
00170               _StateSeqT(
00171                 *_M_nfa,
00172                 _M_nfa->_M_insert_lookahead(__tmp._M_start, __neg)));
00173         }
00174       else
00175         return false;
00176       return true;
00177     }
00178 
00179   template<typename _TraitsT>
00180     bool
00181     _Compiler<_TraitsT>::
00182     _M_quantifier()
00183     {
00184       bool __neg = (_M_flags & regex_constants::ECMAScript);
00185       auto __init = [this, &__neg]()
00186         {
00187           if (_M_stack.empty())
00188             __throw_regex_error(regex_constants::error_badrepeat,
00189                                 "Nothing to repeat before a quantifier.");
00190           __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
00191         };
00192       if (_M_match_token(_ScannerT::_S_token_closure0))
00193         {
00194           __init();
00195           auto __e = _M_pop();
00196           _StateSeqT __r(*_M_nfa,
00197                          _M_nfa->_M_insert_repeat(_S_invalid_state_id,
00198                                                   __e._M_start, __neg));
00199           __e._M_append(__r);
00200           _M_stack.push(__r);
00201         }
00202       else if (_M_match_token(_ScannerT::_S_token_closure1))
00203         {
00204           __init();
00205           auto __e = _M_pop();
00206           __e._M_append(_M_nfa->_M_insert_repeat(_S_invalid_state_id,
00207                                                  __e._M_start, __neg));
00208           _M_stack.push(__e);
00209         }
00210       else if (_M_match_token(_ScannerT::_S_token_opt))
00211         {
00212           __init();
00213           auto __e = _M_pop();
00214           auto __end = _M_nfa->_M_insert_dummy();
00215           _StateSeqT __r(*_M_nfa,
00216                          _M_nfa->_M_insert_repeat(_S_invalid_state_id,
00217                                                   __e._M_start, __neg));
00218           __e._M_append(__end);
00219           __r._M_append(__end);
00220           _M_stack.push(__r);
00221         }
00222       else if (_M_match_token(_ScannerT::_S_token_interval_begin))
00223         {
00224           if (_M_stack.empty())
00225             __throw_regex_error(regex_constants::error_badrepeat,
00226                                 "Nothing to repeat before a quantifier.");
00227           if (!_M_match_token(_ScannerT::_S_token_dup_count))
00228             __throw_regex_error(regex_constants::error_badbrace,
00229                                 "Unexpected token in brace expression.");
00230           _StateSeqT __r(_M_pop());
00231           _StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy());
00232           long __min_rep = _M_cur_int_value(10);
00233           bool __infi = false;
00234           long __n;
00235 
00236           // {3
00237           if (_M_match_token(_ScannerT::_S_token_comma))
00238             if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7}
00239               __n = _M_cur_int_value(10) - __min_rep;
00240             else
00241               __infi = true;
00242           else
00243             __n = 0;
00244           if (!_M_match_token(_ScannerT::_S_token_interval_end))
00245             __throw_regex_error(regex_constants::error_brace,
00246                                 "Unexpected end of brace expression.");
00247 
00248           __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
00249 
00250           for (long __i = 0; __i < __min_rep; ++__i)
00251             __e._M_append(__r._M_clone());
00252 
00253           if (__infi)
00254             {
00255               auto __tmp = __r._M_clone();
00256               _StateSeqT __s(*_M_nfa,
00257                              _M_nfa->_M_insert_repeat(_S_invalid_state_id,
00258                                                       __tmp._M_start, __neg));
00259               __tmp._M_append(__s);
00260               __e._M_append(__s);
00261             }
00262           else
00263             {
00264               if (__n < 0)
00265                 __throw_regex_error(regex_constants::error_badbrace,
00266                                     "Invalid range in brace expression.");
00267               auto __end = _M_nfa->_M_insert_dummy();
00268               // _M_alt is the "match more" branch, and _M_next is the
00269               // "match less" one. Switch _M_alt and _M_next of all created
00270               // nodes. This is a hack but IMO works well.
00271               std::stack<_StateIdT> __stack;
00272               for (long __i = 0; __i < __n; ++__i)
00273                 {
00274                   auto __tmp = __r._M_clone();
00275                   auto __alt = _M_nfa->_M_insert_repeat(__tmp._M_start,
00276                                                         __end, __neg);
00277                   __stack.push(__alt);
00278                   __e._M_append(_StateSeqT(*_M_nfa, __alt, __tmp._M_end));
00279                 }
00280               __e._M_append(__end);
00281               while (!__stack.empty())
00282                 {
00283                   auto& __tmp = (*_M_nfa)[__stack.top()];
00284                   __stack.pop();
00285                   std::swap(__tmp._M_next, __tmp._M_alt);
00286                 }
00287             }
00288           _M_stack.push(__e);
00289         }
00290       else
00291         return false;
00292       return true;
00293     }
00294 
// Expands to a single statement that calls the member function template
// __func<__icase, __collate>(...) with both template arguments chosen at
// run time from the icase and collate bits of _M_flags.  Any arguments
// after __func are forwarded to the call.
//
// The variadic part uses the standard "..." / __VA_ARGS__ form instead of
// the GNU-only named form, so no non-reserved identifier appears in this
// header.  The do/while wrapper makes the expansion safe in an unbraced
// if/else.
#define __INSERT_REGEX_MATCHER(__func, ...)\
        do {\
          if (!(_M_flags & regex_constants::icase))\
            {\
              if (!(_M_flags & regex_constants::collate))\
                __func<false, false>(__VA_ARGS__);\
              else\
                __func<false, true>(__VA_ARGS__);\
            }\
          else\
            {\
              if (!(_M_flags & regex_constants::collate))\
                __func<true, false>(__VA_ARGS__);\
              else\
                __func<true, true>(__VA_ARGS__);\
            }\
        } while (false)
00308 
00309   template<typename _TraitsT>
00310     bool
00311     _Compiler<_TraitsT>::
00312     _M_atom()
00313     {
00314       if (_M_match_token(_ScannerT::_S_token_anychar))
00315         {
00316           if (!(_M_flags & regex_constants::ECMAScript))
00317             __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix);
00318           else
00319             __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma);
00320         }
00321       else if (_M_try_char())
00322         __INSERT_REGEX_MATCHER(_M_insert_char_matcher);
00323       else if (_M_match_token(_ScannerT::_S_token_backref))
00324         _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
00325                                  _M_insert_backref(_M_cur_int_value(10))));
00326       else if (_M_match_token(_ScannerT::_S_token_quoted_class))
00327         __INSERT_REGEX_MATCHER(_M_insert_character_class_matcher);
00328       else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin))
00329         {
00330           _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy());
00331           this->_M_disjunction();
00332           if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
00333             __throw_regex_error(regex_constants::error_paren,
00334                                 "Parenthesis is not closed.");
00335           __r._M_append(_M_pop());
00336           _M_stack.push(__r);
00337         }
00338       else if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
00339         {
00340           _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin());
00341           this->_M_disjunction();
00342           if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
00343             __throw_regex_error(regex_constants::error_paren,
00344                                 "Parenthesis is not closed.");
00345           __r._M_append(_M_pop());
00346           __r._M_append(_M_nfa->_M_insert_subexpr_end());
00347           _M_stack.push(__r);
00348         }
00349       else if (!_M_bracket_expression())
00350         return false;
00351       return true;
00352     }
00353 
00354   template<typename _TraitsT>
00355     bool
00356     _Compiler<_TraitsT>::
00357     _M_bracket_expression()
00358     {
00359       bool __neg =
00360         _M_match_token(_ScannerT::_S_token_bracket_neg_begin);
00361       if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin)))
00362         return false;
00363       __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg);
00364       return true;
00365     }
00366 #undef __INSERT_REGEX_MATCHER
00367 
00368   template<typename _TraitsT>
00369   template<bool __icase, bool __collate>
00370     void
00371     _Compiler<_TraitsT>::
00372     _M_insert_any_matcher_ecma()
00373     {
00374       _M_stack.push(_StateSeqT(*_M_nfa,
00375         _M_nfa->_M_insert_matcher
00376           (_AnyMatcher<_TraitsT, true, __icase, __collate>
00377             (_M_traits))));
00378     }
00379 
00380   template<typename _TraitsT>
00381   template<bool __icase, bool __collate>
00382     void
00383     _Compiler<_TraitsT>::
00384     _M_insert_any_matcher_posix()
00385     {
00386       _M_stack.push(_StateSeqT(*_M_nfa,
00387         _M_nfa->_M_insert_matcher
00388           (_AnyMatcher<_TraitsT, false, __icase, __collate>
00389             (_M_traits))));
00390     }
00391 
00392   template<typename _TraitsT>
00393   template<bool __icase, bool __collate>
00394     void
00395     _Compiler<_TraitsT>::
00396     _M_insert_char_matcher()
00397     {
00398       _M_stack.push(_StateSeqT(*_M_nfa,
00399         _M_nfa->_M_insert_matcher
00400           (_CharMatcher<_TraitsT, __icase, __collate>
00401             (_M_value[0], _M_traits))));
00402     }
00403 
00404   template<typename _TraitsT>
00405   template<bool __icase, bool __collate>
00406     void
00407     _Compiler<_TraitsT>::
00408     _M_insert_character_class_matcher()
00409     {
00410       __glibcxx_assert(_M_value.size() == 1);
00411       _BracketMatcher<_TraitsT, __icase, __collate> __matcher
00412         (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits);
00413       __matcher._M_add_character_class(_M_value, false);
00414       __matcher._M_ready();
00415       _M_stack.push(_StateSeqT(*_M_nfa,
00416         _M_nfa->_M_insert_matcher(std::move(__matcher))));
00417     }
00418 
00419   template<typename _TraitsT>
00420   template<bool __icase, bool __collate>
00421     void
00422     _Compiler<_TraitsT>::
00423     _M_insert_bracket_matcher(bool __neg)
00424     {
00425       _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits);
00426       pair<bool, _CharT> __last_char; // Optional<_CharT>
00427       __last_char.first = false;
00428       if (!(_M_flags & regex_constants::ECMAScript))
00429         if (_M_try_char())
00430           {
00431             __matcher._M_add_char(_M_value[0]);
00432             __last_char.first = true;
00433             __last_char.second = _M_value[0];
00434           }
00435       while (_M_expression_term(__last_char, __matcher));
00436       __matcher._M_ready();
00437       _M_stack.push(_StateSeqT(
00438                       *_M_nfa,
00439                       _M_nfa->_M_insert_matcher(std::move(__matcher))));
00440     }
00441 
00442   template<typename _TraitsT>
00443   template<bool __icase, bool __collate>
00444     bool
00445     _Compiler<_TraitsT>::
00446     _M_expression_term(pair<bool, _CharT>& __last_char,
00447                        _BracketMatcher<_TraitsT, __icase, __collate>& __matcher)
00448     {
00449       if (_M_match_token(_ScannerT::_S_token_bracket_end))
00450         return false;
00451 
00452       if (_M_match_token(_ScannerT::_S_token_collsymbol))
00453         {
00454           auto __symbol = __matcher._M_add_collate_element(_M_value);
00455           if (__symbol.size() == 1)
00456             {
00457               __last_char.first = true;
00458               __last_char.second = __symbol[0];
00459             }
00460         }
00461       else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
00462         __matcher._M_add_equivalence_class(_M_value);
00463       else if (_M_match_token(_ScannerT::_S_token_char_class_name))
00464         __matcher._M_add_character_class(_M_value, false);
00465       // POSIX doesn't allow '-' as a start-range char (say [a-z--0]),
00466       // except when the '-' is the first or last character in the bracket
00467       // expression ([--0]). ECMAScript treats all '-' after a range as a
00468       // normal character. Also see above, where _M_expression_term gets called.
00469       //
00470       // As a result, POSIX rejects [-----], but ECMAScript doesn't.
00471       // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax.
00472       // Clang (3.5) always uses ECMAScript style even in its POSIX syntax.
00473       //
00474       // It turns out that no one reads BNFs ;)
00475       else if (_M_try_char())
00476         {
00477           if (!__last_char.first)
00478             {
00479               __matcher._M_add_char(_M_value[0]);
00480               if (_M_value[0] == '-'
00481                   && !(_M_flags & regex_constants::ECMAScript))
00482                 {
00483                   if (_M_match_token(_ScannerT::_S_token_bracket_end))
00484                     return false;
00485                   __throw_regex_error(
00486                     regex_constants::error_range,
00487                     "Unexpected dash in bracket expression. For POSIX syntax, "
00488                     "a dash is not treated literally only when it is at "
00489                     "beginning or end.");
00490                 }
00491               __last_char.first = true;
00492               __last_char.second = _M_value[0];
00493             }
00494           else
00495             {
00496               if (_M_value[0] == '-')
00497                 {
00498                   if (_M_try_char())
00499                     {
00500                       __matcher._M_make_range(__last_char.second , _M_value[0]);
00501                       __last_char.first = false;
00502                     }
00503                   else
00504                     {
00505                       if (_M_scanner._M_get_token()
00506                           != _ScannerT::_S_token_bracket_end)
00507                         __throw_regex_error(
00508                           regex_constants::error_range,
00509                           "Unexpected end of bracket expression.");
00510                       __matcher._M_add_char(_M_value[0]);
00511                     }
00512                 }
00513               else
00514                 {
00515                   __matcher._M_add_char(_M_value[0]);
00516                   __last_char.second = _M_value[0];
00517                 }
00518             }
00519         }
00520       else if (_M_match_token(_ScannerT::_S_token_quoted_class))
00521         __matcher._M_add_character_class(_M_value,
00522                                          _M_ctype.is(_CtypeT::upper,
00523                                                      _M_value[0]));
00524       else
00525         __throw_regex_error(regex_constants::error_brack,
00526                             "Unexpected character in bracket expression.");
00527 
00528       return true;
00529     }
00530 
00531   template<typename _TraitsT>
00532     bool
00533     _Compiler<_TraitsT>::
00534     _M_try_char()
00535     {
00536       bool __is_char = false;
00537       if (_M_match_token(_ScannerT::_S_token_oct_num))
00538         {
00539           __is_char = true;
00540           _M_value.assign(1, _M_cur_int_value(8));
00541         }
00542       else if (_M_match_token(_ScannerT::_S_token_hex_num))
00543         {
00544           __is_char = true;
00545           _M_value.assign(1, _M_cur_int_value(16));
00546         }
00547       else if (_M_match_token(_ScannerT::_S_token_ord_char))
00548         __is_char = true;
00549       return __is_char;
00550     }
00551 
00552   template<typename _TraitsT>
00553     bool
00554     _Compiler<_TraitsT>::
00555     _M_match_token(_TokenT token)
00556     {
00557       if (token == _M_scanner._M_get_token())
00558         {
00559           _M_value = _M_scanner._M_get_value();
00560           _M_scanner._M_advance();
00561           return true;
00562         }
00563       return false;
00564     }
00565 
00566   template<typename _TraitsT>
00567     int
00568     _Compiler<_TraitsT>::
00569     _M_cur_int_value(int __radix)
00570     {
00571       long __v = 0;
00572       for (typename _StringT::size_type __i = 0;
00573            __i < _M_value.length(); ++__i)
00574         __v =__v * __radix + _M_traits.value(_M_value[__i], __radix);
00575       return __v;
00576     }
00577 
00578   template<typename _TraitsT, bool __icase, bool __collate>
00579     bool
00580     _BracketMatcher<_TraitsT, __icase, __collate>::
00581     _M_apply(_CharT __ch, false_type) const
00582     {
00583       bool __ret = std::binary_search(_M_char_set.begin(), _M_char_set.end(),
00584                                       _M_translator._M_translate(__ch));
00585       if (!__ret)
00586         {
00587           auto __s = _M_translator._M_transform(__ch);
00588           for (auto& __it : _M_range_set)
00589             if (__it.first <= __s && __s <= __it.second)
00590               {
00591                 __ret = true;
00592                 break;
00593               }
00594           if (_M_traits.isctype(__ch, _M_class_set))
00595             __ret = true;
00596           else if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(),
00597                              _M_traits.transform_primary(&__ch, &__ch+1))
00598                    != _M_equiv_set.end())
00599             __ret = true;
00600           else
00601             {
00602               for (auto& __it : _M_neg_class_set)
00603                 if (!_M_traits.isctype(__ch, __it))
00604                   {
00605                     __ret = true;
00606                     break;
00607                   }
00608             }
00609         }
00610       if (_M_is_non_matching)
00611         return !__ret;
00612       else
00613         return __ret;
00614     }
00615 
00616 _GLIBCXX_END_NAMESPACE_VERSION
00617 } // namespace __detail
00618 } // namespace