libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2013-2016 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex_scanner.tcc 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 // FIXME make comments doxygen format. 00032 00033 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep 00034 // and awk 00035 // 1) grep is basic except '\n' is treated as '|' 00036 // 2) egrep is extended except '\n' is treated as '|' 00037 // 3) awk is extended except special escaping rules, and there's no 00038 // back-reference. 00039 // 00040 // References: 00041 // 00042 // ECMAScript: ECMA-262 15.10 00043 // 00044 // basic, extended: 00045 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html 00046 // 00047 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html 00048 00049 namespace std _GLIBCXX_VISIBILITY(default) 00050 { 00051 namespace __detail 00052 { 00053 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00054 00055 template<typename _CharT> 00056 _Scanner<_CharT>:: 00057 _Scanner(typename _Scanner::_IterT __begin, 00058 typename _Scanner::_IterT __end, 00059 _FlagT __flags, std::locale __loc) 00060 : _ScannerBase(__flags), 00061 _M_current(__begin), _M_end(__end), 00062 _M_ctype(std::use_facet<_CtypeT>(__loc)), 00063 _M_eat_escape(_M_is_ecma() 00064 ? &_Scanner::_M_eat_escape_ecma 00065 : &_Scanner::_M_eat_escape_posix) 00066 { _M_advance(); } 00067 00068 template<typename _CharT> 00069 void 00070 _Scanner<_CharT>:: 00071 _M_advance() 00072 { 00073 if (_M_current == _M_end) 00074 { 00075 _M_token = _S_token_eof; 00076 return; 00077 } 00078 00079 if (_M_state == _S_state_normal) 00080 _M_scan_normal(); 00081 else if (_M_state == _S_state_in_bracket) 00082 _M_scan_in_bracket(); 00083 else if (_M_state == _S_state_in_brace) 00084 _M_scan_in_brace(); 00085 else 00086 { 00087 __glibcxx_assert(false); 00088 } 00089 } 00090 00091 // Differences between styles: 00092 // 1) "\(", "\)", "\{" in basic. It's not escaping. 00093 // 2) "(?:", "(?=", "(?!" in ECMAScript. 00094 template<typename _CharT> 00095 void 00096 _Scanner<_CharT>:: 00097 _M_scan_normal() 00098 { 00099 auto __c = *_M_current++; 00100 00101 if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr) 00102 { 00103 _M_token = _S_token_ord_char; 00104 _M_value.assign(1, __c); 00105 return; 00106 } 00107 if (__c == '\\') 00108 { 00109 if (_M_current == _M_end) 00110 __throw_regex_error( 00111 regex_constants::error_escape, 00112 "Unexpected end of regex when escaping."); 00113 00114 if (!_M_is_basic() 00115 || (*_M_current != '(' 00116 && *_M_current != ')' 00117 && *_M_current != '{')) 00118 { 00119 (this->*_M_eat_escape)(); 00120 return; 00121 } 00122 __c = *_M_current++; 00123 } 00124 if (__c == '(') 00125 { 00126 if (_M_is_ecma() && *_M_current == '?') 00127 { 00128 if (++_M_current == _M_end) 00129 __throw_regex_error( 00130 regex_constants::error_paren, 00131 "Unexpected end of regex when in an open parenthesis."); 00132 00133 if (*_M_current == ':') 00134 { 00135 ++_M_current; 00136 _M_token = _S_token_subexpr_no_group_begin; 00137 } 00138 else if (*_M_current == '=') 00139 { 00140 ++_M_current; 00141 _M_token = _S_token_subexpr_lookahead_begin; 00142 _M_value.assign(1, 'p'); 00143 } 00144 else if (*_M_current == '!') 00145 { 00146 ++_M_current; 00147 _M_token = _S_token_subexpr_lookahead_begin; 00148 _M_value.assign(1, 'n'); 00149 } 00150 else 00151 __throw_regex_error( 00152 regex_constants::error_paren, 00153 "Invalid special open parenthesis."); 00154 } 00155 else if (_M_flags & regex_constants::nosubs) 00156 _M_token = _S_token_subexpr_no_group_begin; 00157 else 00158 _M_token = _S_token_subexpr_begin; 00159 } 00160 else if (__c == ')') 00161 _M_token = _S_token_subexpr_end; 00162 else if (__c == '[') 00163 { 00164 _M_state = _S_state_in_bracket; 00165 _M_at_bracket_start = true; 00166 if (_M_current != _M_end && *_M_current == '^') 00167 { 00168 _M_token = _S_token_bracket_neg_begin; 00169 ++_M_current; 00170 } 00171 else 00172 _M_token = _S_token_bracket_begin; 00173 } 00174 else if (__c == '{') 00175 { 00176 _M_state = _S_state_in_brace; 00177 _M_token = _S_token_interval_begin; 00178 } 00179 else if (__c != ']' && __c != '}') 00180 { 00181 auto __it = _M_token_tbl; 00182 auto __narrowc = _M_ctype.narrow(__c, '\0'); 00183 for (; __it->first != '\0'; ++__it) 00184 if (__it->first == __narrowc) 00185 { 00186 _M_token = __it->second; 00187 return; 00188 } 00189 __glibcxx_assert(false); 00190 } 00191 else 00192 { 00193 _M_token = _S_token_ord_char; 00194 _M_value.assign(1, __c); 00195 } 00196 } 00197 00198 // Differences between styles: 00199 // 1) different semantics of "[]" and "[^]". 00200 // 2) Escaping in bracket expr. 00201 template<typename _CharT> 00202 void 00203 _Scanner<_CharT>:: 00204 _M_scan_in_bracket() 00205 { 00206 if (_M_current == _M_end) 00207 __throw_regex_error( 00208 regex_constants::error_brack, 00209 "Unexpected end of regex when in bracket expression."); 00210 00211 auto __c = *_M_current++; 00212 00213 if (__c == '[') 00214 { 00215 if (_M_current == _M_end) 00216 __throw_regex_error(regex_constants::error_brack, 00217 "Unexpected character class open bracket."); 00218 00219 if (*_M_current == '.') 00220 { 00221 _M_token = _S_token_collsymbol; 00222 _M_eat_class(*_M_current++); 00223 } 00224 else if (*_M_current == ':') 00225 { 00226 _M_token = _S_token_char_class_name; 00227 _M_eat_class(*_M_current++); 00228 } 00229 else if (*_M_current == '=') 00230 { 00231 _M_token = _S_token_equiv_class_name; 00232 _M_eat_class(*_M_current++); 00233 } 00234 else 00235 { 00236 _M_token = _S_token_ord_char; 00237 _M_value.assign(1, __c); 00238 } 00239 } 00240 // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted 00241 // literally. So "[]]" and "[^]]" are valid regexes. See the testcases 00242 // `*/empty_range.cc`. 00243 else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start)) 00244 { 00245 _M_token = _S_token_bracket_end; 00246 _M_state = _S_state_normal; 00247 } 00248 // ECMAScript and awk permits escaping in bracket. 00249 else if (__c == '\\' && (_M_is_ecma() || _M_is_awk())) 00250 (this->*_M_eat_escape)(); 00251 else 00252 { 00253 _M_token = _S_token_ord_char; 00254 _M_value.assign(1, __c); 00255 } 00256 _M_at_bracket_start = false; 00257 } 00258 00259 // Differences between styles: 00260 // 1) "\}" in basic style. 00261 template<typename _CharT> 00262 void 00263 _Scanner<_CharT>:: 00264 _M_scan_in_brace() 00265 { 00266 if (_M_current == _M_end) 00267 __throw_regex_error( 00268 regex_constants::error_brace, 00269 "Unexpected end of regex when in brace expression."); 00270 00271 auto __c = *_M_current++; 00272 00273 if (_M_ctype.is(_CtypeT::digit, __c)) 00274 { 00275 _M_token = _S_token_dup_count; 00276 _M_value.assign(1, __c); 00277 while (_M_current != _M_end 00278 && _M_ctype.is(_CtypeT::digit, *_M_current)) 00279 _M_value += *_M_current++; 00280 } 00281 else if (__c == ',') 00282 _M_token = _S_token_comma; 00283 // basic use \}. 00284 else if (_M_is_basic()) 00285 { 00286 if (__c == '\\' && _M_current != _M_end && *_M_current == '}') 00287 { 00288 _M_state = _S_state_normal; 00289 _M_token = _S_token_interval_end; 00290 ++_M_current; 00291 } 00292 else 00293 __throw_regex_error(regex_constants::error_badbrace, 00294 "Unexpected character in brace expression."); 00295 } 00296 else if (__c == '}') 00297 { 00298 _M_state = _S_state_normal; 00299 _M_token = _S_token_interval_end; 00300 } 00301 else 00302 __throw_regex_error(regex_constants::error_badbrace, 00303 "Unexpected character in brace expression."); 00304 } 00305 00306 template<typename _CharT> 00307 void 00308 _Scanner<_CharT>:: 00309 _M_eat_escape_ecma() 00310 { 00311 if (_M_current == _M_end) 00312 __throw_regex_error(regex_constants::error_escape, 00313 "Unexpected end of regex when escaping."); 00314 00315 auto __c = *_M_current++; 00316 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 00317 00318 if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket)) 00319 { 00320 _M_token = _S_token_ord_char; 00321 _M_value.assign(1, *__pos); 00322 } 00323 else if (__c == 'b') 00324 { 00325 _M_token = _S_token_word_bound; 00326 _M_value.assign(1, 'p'); 00327 } 00328 else if (__c == 'B') 00329 { 00330 _M_token = _S_token_word_bound; 00331 _M_value.assign(1, 'n'); 00332 } 00333 // N3376 28.13 00334 else if (__c == 'd' 00335 || __c == 'D' 00336 || __c == 's' 00337 || __c == 'S' 00338 || __c == 'w' 00339 || __c == 'W') 00340 { 00341 _M_token = _S_token_quoted_class; 00342 _M_value.assign(1, __c); 00343 } 00344 else if (__c == 'c') 00345 { 00346 if (_M_current == _M_end) 00347 __throw_regex_error( 00348 regex_constants::error_escape, 00349 "Unexpected end of regex when reading control code."); 00350 _M_token = _S_token_ord_char; 00351 _M_value.assign(1, *_M_current++); 00352 } 00353 else if (__c == 'x' || __c == 'u') 00354 { 00355 _M_value.erase(); 00356 for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++) 00357 { 00358 if (_M_current == _M_end 00359 || !_M_ctype.is(_CtypeT::xdigit, *_M_current)) 00360 __throw_regex_error( 00361 regex_constants::error_escape, 00362 "Unexpected end of regex when ascii character."); 00363 _M_value += *_M_current++; 00364 } 00365 _M_token = _S_token_hex_num; 00366 } 00367 // ECMAScript recognizes multi-digit back-references. 00368 else if (_M_ctype.is(_CtypeT::digit, __c)) 00369 { 00370 _M_value.assign(1, __c); 00371 while (_M_current != _M_end 00372 && _M_ctype.is(_CtypeT::digit, *_M_current)) 00373 _M_value += *_M_current++; 00374 _M_token = _S_token_backref; 00375 } 00376 else 00377 { 00378 _M_token = _S_token_ord_char; 00379 _M_value.assign(1, __c); 00380 } 00381 } 00382 00383 // Differences between styles: 00384 // 1) Extended doesn't support backref, but basic does. 00385 template<typename _CharT> 00386 void 00387 _Scanner<_CharT>:: 00388 _M_eat_escape_posix() 00389 { 00390 if (_M_current == _M_end) 00391 __throw_regex_error(regex_constants::error_escape, 00392 "Unexpected end of regex when escaping."); 00393 00394 auto __c = *_M_current; 00395 auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')); 00396 00397 if (__pos != nullptr && *__pos != '\0') 00398 { 00399 _M_token = _S_token_ord_char; 00400 _M_value.assign(1, __c); 00401 } 00402 // We MUST judge awk before handling backrefs. There's no backref in awk. 00403 else if (_M_is_awk()) 00404 { 00405 _M_eat_escape_awk(); 00406 return; 00407 } 00408 else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0') 00409 { 00410 _M_token = _S_token_backref; 00411 _M_value.assign(1, __c); 00412 } 00413 else 00414 { 00415 #ifdef __STRICT_ANSI__ 00416 // POSIX says it is undefined to escape ordinary characters 00417 __throw_regex_error(regex_constants::error_escape, 00418 "Unexpected escape character."); 00419 #else 00420 _M_token = _S_token_ord_char; 00421 _M_value.assign(1, __c); 00422 #endif 00423 } 00424 ++_M_current; 00425 } 00426 00427 template<typename _CharT> 00428 void 00429 _Scanner<_CharT>:: 00430 _M_eat_escape_awk() 00431 { 00432 auto __c = *_M_current++; 00433 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 00434 00435 if (__pos != nullptr) 00436 { 00437 _M_token = _S_token_ord_char; 00438 _M_value.assign(1, *__pos); 00439 } 00440 // \ddd for oct representation 00441 else if (_M_ctype.is(_CtypeT::digit, __c) 00442 && __c != '8' 00443 && __c != '9') 00444 { 00445 _M_value.assign(1, __c); 00446 for (int __i = 0; 00447 __i < 2 00448 && _M_current != _M_end 00449 && _M_ctype.is(_CtypeT::digit, *_M_current) 00450 && *_M_current != '8' 00451 && *_M_current != '9'; 00452 __i++) 00453 _M_value += *_M_current++; 00454 _M_token = _S_token_oct_num; 00455 return; 00456 } 00457 else 00458 __throw_regex_error(regex_constants::error_escape, 00459 "Unexpected escape character."); 00460 } 00461 00462 // Eats a character class or throws an exception. 00463 // __ch could be ':', '.' or '=', _M_current is the char after ']' when 00464 // returning. 00465 template<typename _CharT> 00466 void 00467 _Scanner<_CharT>:: 00468 _M_eat_class(char __ch) 00469 { 00470 for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;) 00471 _M_value += *_M_current++; 00472 if (_M_current == _M_end 00473 || *_M_current++ != __ch 00474 || _M_current == _M_end // skip __ch 00475 || *_M_current++ != ']') // skip ']' 00476 { 00477 if (__ch == ':') 00478 __throw_regex_error(regex_constants::error_ctype, 00479 "Unexpected end of character class."); 00480 else 00481 __throw_regex_error(regex_constants::error_collate, 00482 "Unexpected end of character class."); 00483 } 00484 } 00485 00486 #ifdef _GLIBCXX_DEBUG 00487 template<typename _CharT> 00488 std::ostream& 00489 _Scanner<_CharT>:: 00490 _M_print(std::ostream& ostr) 00491 { 00492 switch (_M_token) 00493 { 00494 case _S_token_anychar: 00495 ostr << "any-character\n"; 00496 break; 00497 case _S_token_backref: 00498 ostr << "backref\n"; 00499 break; 00500 case _S_token_bracket_begin: 00501 ostr << "bracket-begin\n"; 00502 break; 00503 case _S_token_bracket_neg_begin: 00504 ostr << "bracket-neg-begin\n"; 00505 break; 00506 case _S_token_bracket_end: 00507 ostr << "bracket-end\n"; 00508 break; 00509 case _S_token_char_class_name: 00510 ostr << "char-class-name \"" << _M_value << "\"\n"; 00511 break; 00512 case _S_token_closure0: 00513 ostr << "closure0\n"; 00514 break; 00515 case _S_token_closure1: 00516 ostr << "closure1\n"; 00517 break; 00518 case _S_token_collsymbol: 00519 ostr << "collsymbol \"" << _M_value << "\"\n"; 00520 break; 00521 case _S_token_comma: 00522 ostr << "comma\n"; 00523 break; 00524 case _S_token_dup_count: 00525 ostr << "dup count: " << _M_value << "\n"; 00526 break; 00527 case _S_token_eof: 00528 ostr << "EOF\n"; 00529 break; 00530 case _S_token_equiv_class_name: 00531 ostr << "equiv-class-name \"" << _M_value << "\"\n"; 00532 break; 00533 case _S_token_interval_begin: 00534 ostr << "interval begin\n"; 00535 break; 00536 case _S_token_interval_end: 00537 ostr << "interval end\n"; 00538 break; 00539 case _S_token_line_begin: 00540 ostr << "line begin\n"; 00541 break; 00542 case _S_token_line_end: 00543 ostr << "line end\n"; 00544 break; 00545 case _S_token_opt: 00546 ostr << "opt\n"; 00547 break; 00548 case _S_token_or: 00549 ostr << "or\n"; 00550 break; 00551 case _S_token_ord_char: 00552 ostr << "ordinary character: \"" << _M_value << "\"\n"; 00553 break; 00554 case _S_token_subexpr_begin: 00555 ostr << "subexpr begin\n"; 00556 break; 00557 case _S_token_subexpr_no_group_begin: 00558 ostr << "no grouping subexpr begin\n"; 00559 break; 00560 case _S_token_subexpr_lookahead_begin: 00561 ostr << "lookahead subexpr begin\n"; 00562 break; 00563 case _S_token_subexpr_end: 00564 ostr << "subexpr end\n"; 00565 break; 00566 case _S_token_unknown: 00567 ostr << "-- unknown token --\n"; 00568 break; 00569 case _S_token_oct_num: 00570 ostr << "oct number " << _M_value << "\n"; 00571 break; 00572 case _S_token_hex_num: 00573 ostr << "hex number " << _M_value << "\n"; 00574 break; 00575 case _S_token_quoted_class: 00576 ostr << "quoted class " << "\\" << _M_value << "\n"; 00577 break; 00578 default: 00579 _GLIBCXX_DEBUG_ASSERT(false); 00580 } 00581 return ostr; 00582 } 00583 #endif 00584 00585 _GLIBCXX_END_NAMESPACE_VERSION 00586 } // namespace __detail 00587 } // namespace