jinja2.lexer

Implements a Jinja / Python combination lexer. The Lexer class is used to do
some preprocessing. It filters out invalid operators like the bitshift
operators we don't allow in templates. It separates template code and Python
code in expressions.
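For illustration, a minimal sketch of what the lexer produces for a small template, using only helpers defined in this module; the expected output for the default delimiters is shown as comments:

    from jinja2 import Environment
    from jinja2.lexer import get_lexer

    lexer = get_lexer(Environment())
    # tokenize() returns a TokenStream of already filtered and converted tokens.
    for token in lexer.tokenize("Hello {{ user.name }}!"):
        print(token.lineno, token.type, token.value)
    # 1 data Hello 
    # 1 variable_begin {{
    # 1 name user
    # 1 dot .
    # 1 name name
    # 1 variable_end }}
    # 1 data !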
1"""Implements a Jinja / Python combination lexer. The ``Lexer`` class 2is used to do some preprocessing. It filters out invalid operators like 3the bitshift operators we don't allow in templates. It separates 4template code and python code in expressions. 5""" 6 7import re 8import typing as t 9from ast import literal_eval 10from collections import deque 11from sys import intern 12 13from ._identifier import pattern as name_re 14from .exceptions import TemplateSyntaxError 15from .utils import LRUCache 16 17if t.TYPE_CHECKING: 18 import typing_extensions as te 19 20 from .environment import Environment 21 22# cache for the lexers. Exists in order to be able to have multiple 23# environments with the same lexer 24_lexer_cache: t.MutableMapping[t.Tuple, "Lexer"] = LRUCache(50) # type: ignore 25 26# static regular expressions 27whitespace_re = re.compile(r"\s+") 28newline_re = re.compile(r"(\r\n|\r|\n)") 29string_re = re.compile( 30 r"('([^'\\]*(?:\\.[^'\\]*)*)'" r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S 31) 32integer_re = re.compile( 33 r""" 34 ( 35 0b(_?[0-1])+ # binary 36 | 37 0o(_?[0-7])+ # octal 38 | 39 0x(_?[\da-f])+ # hex 40 | 41 [1-9](_?\d)* # decimal 42 | 43 0(_?0)* # decimal zero 44 ) 45 """, 46 re.IGNORECASE | re.VERBOSE, 47) 48float_re = re.compile( 49 r""" 50 (?<!\.) # doesn't start with a . 51 (\d+_)*\d+ # digits, possibly _ separated 52 ( 53 (\.(\d+_)*\d+)? # optional fractional part 54 e[+\-]?(\d+_)*\d+ # exponent part 55 | 56 \.(\d+_)*\d+ # required fractional part 57 ) 58 """, 59 re.IGNORECASE | re.VERBOSE, 60) 61 62# internal the tokens and keep references to them 63TOKEN_ADD = intern("add") 64TOKEN_ASSIGN = intern("assign") 65TOKEN_COLON = intern("colon") 66TOKEN_COMMA = intern("comma") 67TOKEN_DIV = intern("div") 68TOKEN_DOT = intern("dot") 69TOKEN_EQ = intern("eq") 70TOKEN_FLOORDIV = intern("floordiv") 71TOKEN_GT = intern("gt") 72TOKEN_GTEQ = intern("gteq") 73TOKEN_LBRACE = intern("lbrace") 74TOKEN_LBRACKET = intern("lbracket") 75TOKEN_LPAREN = intern("lparen") 76TOKEN_LT = intern("lt") 77TOKEN_LTEQ = intern("lteq") 78TOKEN_MOD = intern("mod") 79TOKEN_MUL = intern("mul") 80TOKEN_NE = intern("ne") 81TOKEN_PIPE = intern("pipe") 82TOKEN_POW = intern("pow") 83TOKEN_RBRACE = intern("rbrace") 84TOKEN_RBRACKET = intern("rbracket") 85TOKEN_RPAREN = intern("rparen") 86TOKEN_SEMICOLON = intern("semicolon") 87TOKEN_SUB = intern("sub") 88TOKEN_TILDE = intern("tilde") 89TOKEN_WHITESPACE = intern("whitespace") 90TOKEN_FLOAT = intern("float") 91TOKEN_INTEGER = intern("integer") 92TOKEN_NAME = intern("name") 93TOKEN_STRING = intern("string") 94TOKEN_OPERATOR = intern("operator") 95TOKEN_BLOCK_BEGIN = intern("block_begin") 96TOKEN_BLOCK_END = intern("block_end") 97TOKEN_VARIABLE_BEGIN = intern("variable_begin") 98TOKEN_VARIABLE_END = intern("variable_end") 99TOKEN_RAW_BEGIN = intern("raw_begin") 100TOKEN_RAW_END = intern("raw_end") 101TOKEN_COMMENT_BEGIN = intern("comment_begin") 102TOKEN_COMMENT_END = intern("comment_end") 103TOKEN_COMMENT = intern("comment") 104TOKEN_LINESTATEMENT_BEGIN = intern("linestatement_begin") 105TOKEN_LINESTATEMENT_END = intern("linestatement_end") 106TOKEN_LINECOMMENT_BEGIN = intern("linecomment_begin") 107TOKEN_LINECOMMENT_END = intern("linecomment_end") 108TOKEN_LINECOMMENT = intern("linecomment") 109TOKEN_DATA = intern("data") 110TOKEN_INITIAL = intern("initial") 111TOKEN_EOF = intern("eof") 112 113# bind operators to token types 114operators = { 115 "+": TOKEN_ADD, 116 "-": TOKEN_SUB, 117 "/": TOKEN_DIV, 118 "//": TOKEN_FLOORDIV, 119 "*": TOKEN_MUL, 120 "%": 
TOKEN_MOD, 121 "**": TOKEN_POW, 122 "~": TOKEN_TILDE, 123 "[": TOKEN_LBRACKET, 124 "]": TOKEN_RBRACKET, 125 "(": TOKEN_LPAREN, 126 ")": TOKEN_RPAREN, 127 "{": TOKEN_LBRACE, 128 "}": TOKEN_RBRACE, 129 "==": TOKEN_EQ, 130 "!=": TOKEN_NE, 131 ">": TOKEN_GT, 132 ">=": TOKEN_GTEQ, 133 "<": TOKEN_LT, 134 "<=": TOKEN_LTEQ, 135 "=": TOKEN_ASSIGN, 136 ".": TOKEN_DOT, 137 ":": TOKEN_COLON, 138 "|": TOKEN_PIPE, 139 ",": TOKEN_COMMA, 140 ";": TOKEN_SEMICOLON, 141} 142 143reverse_operators = {v: k for k, v in operators.items()} 144assert len(operators) == len(reverse_operators), "operators dropped" 145operator_re = re.compile( 146 f"({'|'.join(re.escape(x) for x in sorted(operators, key=lambda x: -len(x)))})" 147) 148 149ignored_tokens = frozenset( 150 [ 151 TOKEN_COMMENT_BEGIN, 152 TOKEN_COMMENT, 153 TOKEN_COMMENT_END, 154 TOKEN_WHITESPACE, 155 TOKEN_LINECOMMENT_BEGIN, 156 TOKEN_LINECOMMENT_END, 157 TOKEN_LINECOMMENT, 158 ] 159) 160ignore_if_empty = frozenset( 161 [TOKEN_WHITESPACE, TOKEN_DATA, TOKEN_COMMENT, TOKEN_LINECOMMENT] 162) 163 164 165def _describe_token_type(token_type: str) -> str: 166 if token_type in reverse_operators: 167 return reverse_operators[token_type] 168 169 return { 170 TOKEN_COMMENT_BEGIN: "begin of comment", 171 TOKEN_COMMENT_END: "end of comment", 172 TOKEN_COMMENT: "comment", 173 TOKEN_LINECOMMENT: "comment", 174 TOKEN_BLOCK_BEGIN: "begin of statement block", 175 TOKEN_BLOCK_END: "end of statement block", 176 TOKEN_VARIABLE_BEGIN: "begin of print statement", 177 TOKEN_VARIABLE_END: "end of print statement", 178 TOKEN_LINESTATEMENT_BEGIN: "begin of line statement", 179 TOKEN_LINESTATEMENT_END: "end of line statement", 180 TOKEN_DATA: "template data / text", 181 TOKEN_EOF: "end of template", 182 }.get(token_type, token_type) 183 184 185def describe_token(token: "Token") -> str: 186 """Returns a description of the token.""" 187 if token.type == TOKEN_NAME: 188 return token.value 189 190 return _describe_token_type(token.type) 191 192 193def describe_token_expr(expr: str) -> str: 194 """Like `describe_token` but for token expressions.""" 195 if ":" in expr: 196 type, value = expr.split(":", 1) 197 198 if type == TOKEN_NAME: 199 return value 200 else: 201 type = expr 202 203 return _describe_token_type(type) 204 205 206def count_newlines(value: str) -> int: 207 """Count the number of newline characters in the string. This is 208 useful for extensions that filter a stream. 
209 """ 210 return len(newline_re.findall(value)) 211 212 213def compile_rules(environment: "Environment") -> t.List[t.Tuple[str, str]]: 214 """Compiles all the rules from the environment into a list of rules.""" 215 e = re.escape 216 rules = [ 217 ( 218 len(environment.comment_start_string), 219 TOKEN_COMMENT_BEGIN, 220 e(environment.comment_start_string), 221 ), 222 ( 223 len(environment.block_start_string), 224 TOKEN_BLOCK_BEGIN, 225 e(environment.block_start_string), 226 ), 227 ( 228 len(environment.variable_start_string), 229 TOKEN_VARIABLE_BEGIN, 230 e(environment.variable_start_string), 231 ), 232 ] 233 234 if environment.line_statement_prefix is not None: 235 rules.append( 236 ( 237 len(environment.line_statement_prefix), 238 TOKEN_LINESTATEMENT_BEGIN, 239 r"^[ \t\v]*" + e(environment.line_statement_prefix), 240 ) 241 ) 242 if environment.line_comment_prefix is not None: 243 rules.append( 244 ( 245 len(environment.line_comment_prefix), 246 TOKEN_LINECOMMENT_BEGIN, 247 r"(?:^|(?<=\S))[^\S\r\n]*" + e(environment.line_comment_prefix), 248 ) 249 ) 250 251 return [x[1:] for x in sorted(rules, reverse=True)] 252 253 254class Failure: 255 """Class that raises a `TemplateSyntaxError` if called. 256 Used by the `Lexer` to specify known errors. 257 """ 258 259 def __init__( 260 self, message: str, cls: t.Type[TemplateSyntaxError] = TemplateSyntaxError 261 ) -> None: 262 self.message = message 263 self.error_class = cls 264 265 def __call__(self, lineno: int, filename: t.Optional[str]) -> "te.NoReturn": 266 raise self.error_class(self.message, lineno, filename) 267 268 269class Token(t.NamedTuple): 270 lineno: int 271 type: str 272 value: str 273 274 def __str__(self) -> str: 275 return describe_token(self) 276 277 def test(self, expr: str) -> bool: 278 """Test a token against a token expression. This can either be a 279 token type or ``'token_type:token_value'``. This can only test 280 against string values and types. 281 """ 282 # here we do a regular string equality check as test_any is usually 283 # passed an iterable of not interned strings. 284 if self.type == expr: 285 return True 286 287 if ":" in expr: 288 return expr.split(":", 1) == [self.type, self.value] 289 290 return False 291 292 def test_any(self, *iterable: str) -> bool: 293 """Test against multiple token expressions.""" 294 return any(self.test(expr) for expr in iterable) 295 296 297class TokenStreamIterator: 298 """The iterator for tokenstreams. Iterate over the stream 299 until the eof token is reached. 300 """ 301 302 def __init__(self, stream: "TokenStream") -> None: 303 self.stream = stream 304 305 def __iter__(self) -> "TokenStreamIterator": 306 return self 307 308 def __next__(self) -> Token: 309 token = self.stream.current 310 311 if token.type is TOKEN_EOF: 312 self.stream.close() 313 raise StopIteration 314 315 next(self.stream) 316 return token 317 318 319class TokenStream: 320 """A token stream is an iterable that yields :class:`Token`\\s. The 321 parser however does not iterate over it but calls :meth:`next` to go 322 one token ahead. The current active token is stored as :attr:`current`. 
323 """ 324 325 def __init__( 326 self, 327 generator: t.Iterable[Token], 328 name: t.Optional[str], 329 filename: t.Optional[str], 330 ): 331 self._iter = iter(generator) 332 self._pushed: te.Deque[Token] = deque() 333 self.name = name 334 self.filename = filename 335 self.closed = False 336 self.current = Token(1, TOKEN_INITIAL, "") 337 next(self) 338 339 def __iter__(self) -> TokenStreamIterator: 340 return TokenStreamIterator(self) 341 342 def __bool__(self) -> bool: 343 return bool(self._pushed) or self.current.type is not TOKEN_EOF 344 345 @property 346 def eos(self) -> bool: 347 """Are we at the end of the stream?""" 348 return not self 349 350 def push(self, token: Token) -> None: 351 """Push a token back to the stream.""" 352 self._pushed.append(token) 353 354 def look(self) -> Token: 355 """Look at the next token.""" 356 old_token = next(self) 357 result = self.current 358 self.push(result) 359 self.current = old_token 360 return result 361 362 def skip(self, n: int = 1) -> None: 363 """Got n tokens ahead.""" 364 for _ in range(n): 365 next(self) 366 367 def next_if(self, expr: str) -> t.Optional[Token]: 368 """Perform the token test and return the token if it matched. 369 Otherwise the return value is `None`. 370 """ 371 if self.current.test(expr): 372 return next(self) 373 374 return None 375 376 def skip_if(self, expr: str) -> bool: 377 """Like :meth:`next_if` but only returns `True` or `False`.""" 378 return self.next_if(expr) is not None 379 380 def __next__(self) -> Token: 381 """Go one token ahead and return the old one. 382 383 Use the built-in :func:`next` instead of calling this directly. 384 """ 385 rv = self.current 386 387 if self._pushed: 388 self.current = self._pushed.popleft() 389 elif self.current.type is not TOKEN_EOF: 390 try: 391 self.current = next(self._iter) 392 except StopIteration: 393 self.close() 394 395 return rv 396 397 def close(self) -> None: 398 """Close the stream.""" 399 self.current = Token(self.current.lineno, TOKEN_EOF, "") 400 self._iter = iter(()) 401 self.closed = True 402 403 def expect(self, expr: str) -> Token: 404 """Expect a given token type and return it. This accepts the same 405 argument as :meth:`jinja2.lexer.Token.test`. 406 """ 407 if not self.current.test(expr): 408 expr = describe_token_expr(expr) 409 410 if self.current.type is TOKEN_EOF: 411 raise TemplateSyntaxError( 412 f"unexpected end of template, expected {expr!r}.", 413 self.current.lineno, 414 self.name, 415 self.filename, 416 ) 417 418 raise TemplateSyntaxError( 419 f"expected token {expr!r}, got {describe_token(self.current)!r}", 420 self.current.lineno, 421 self.name, 422 self.filename, 423 ) 424 425 return next(self) 426 427 428def get_lexer(environment: "Environment") -> "Lexer": 429 """Return a lexer which is probably cached.""" 430 key = ( 431 environment.block_start_string, 432 environment.block_end_string, 433 environment.variable_start_string, 434 environment.variable_end_string, 435 environment.comment_start_string, 436 environment.comment_end_string, 437 environment.line_statement_prefix, 438 environment.line_comment_prefix, 439 environment.trim_blocks, 440 environment.lstrip_blocks, 441 environment.newline_sequence, 442 environment.keep_trailing_newline, 443 ) 444 lexer = _lexer_cache.get(key) 445 446 if lexer is None: 447 _lexer_cache[key] = lexer = Lexer(environment) 448 449 return lexer 450 451 452class OptionalLStrip(tuple): # type: ignore[type-arg] 453 """A special tuple for marking a point in the state that can have 454 lstrip applied. 
455 """ 456 457 __slots__ = () 458 459 # Even though it looks like a no-op, creating instances fails 460 # without this. 461 def __new__(cls, *members, **kwargs): # type: ignore 462 return super().__new__(cls, members) 463 464 465class _Rule(t.NamedTuple): 466 pattern: t.Pattern[str] 467 tokens: t.Union[str, t.Tuple[str, ...], t.Tuple[Failure]] 468 command: t.Optional[str] 469 470 471class Lexer: 472 """Class that implements a lexer for a given environment. Automatically 473 created by the environment class, usually you don't have to do that. 474 475 Note that the lexer is not automatically bound to an environment. 476 Multiple environments can share the same lexer. 477 """ 478 479 def __init__(self, environment: "Environment") -> None: 480 # shortcuts 481 e = re.escape 482 483 def c(x: str) -> t.Pattern[str]: 484 return re.compile(x, re.M | re.S) 485 486 # lexing rules for tags 487 tag_rules: t.List[_Rule] = [ 488 _Rule(whitespace_re, TOKEN_WHITESPACE, None), 489 _Rule(float_re, TOKEN_FLOAT, None), 490 _Rule(integer_re, TOKEN_INTEGER, None), 491 _Rule(name_re, TOKEN_NAME, None), 492 _Rule(string_re, TOKEN_STRING, None), 493 _Rule(operator_re, TOKEN_OPERATOR, None), 494 ] 495 496 # assemble the root lexing rule. because "|" is ungreedy 497 # we have to sort by length so that the lexer continues working 498 # as expected when we have parsing rules like <% for block and 499 # <%= for variables. (if someone wants asp like syntax) 500 # variables are just part of the rules if variable processing 501 # is required. 502 root_tag_rules = compile_rules(environment) 503 504 block_start_re = e(environment.block_start_string) 505 block_end_re = e(environment.block_end_string) 506 comment_end_re = e(environment.comment_end_string) 507 variable_end_re = e(environment.variable_end_string) 508 509 # block suffix if trimming is enabled 510 block_suffix_re = "\\n?" 
if environment.trim_blocks else "" 511 512 self.lstrip_blocks = environment.lstrip_blocks 513 514 self.newline_sequence = environment.newline_sequence 515 self.keep_trailing_newline = environment.keep_trailing_newline 516 517 root_raw_re = ( 518 rf"(?P<raw_begin>{block_start_re}(\-|\+|)\s*raw\s*" 519 rf"(?:\-{block_end_re}\s*|{block_end_re}))" 520 ) 521 root_parts_re = "|".join( 522 [root_raw_re] + [rf"(?P<{n}>{r}(\-|\+|))" for n, r in root_tag_rules] 523 ) 524 525 # global lexing rules 526 self.rules: t.Dict[str, t.List[_Rule]] = { 527 "root": [ 528 # directives 529 _Rule( 530 c(rf"(.*?)(?:{root_parts_re})"), 531 OptionalLStrip(TOKEN_DATA, "#bygroup"), # type: ignore 532 "#bygroup", 533 ), 534 # data 535 _Rule(c(".+"), TOKEN_DATA, None), 536 ], 537 # comments 538 TOKEN_COMMENT_BEGIN: [ 539 _Rule( 540 c( 541 rf"(.*?)((?:\+{comment_end_re}|\-{comment_end_re}\s*" 542 rf"|{comment_end_re}{block_suffix_re}))" 543 ), 544 (TOKEN_COMMENT, TOKEN_COMMENT_END), 545 "#pop", 546 ), 547 _Rule(c(r"(.)"), (Failure("Missing end of comment tag"),), None), 548 ], 549 # blocks 550 TOKEN_BLOCK_BEGIN: [ 551 _Rule( 552 c( 553 rf"(?:\+{block_end_re}|\-{block_end_re}\s*" 554 rf"|{block_end_re}{block_suffix_re})" 555 ), 556 TOKEN_BLOCK_END, 557 "#pop", 558 ), 559 ] 560 + tag_rules, 561 # variables 562 TOKEN_VARIABLE_BEGIN: [ 563 _Rule( 564 c(rf"\-{variable_end_re}\s*|{variable_end_re}"), 565 TOKEN_VARIABLE_END, 566 "#pop", 567 ) 568 ] 569 + tag_rules, 570 # raw block 571 TOKEN_RAW_BEGIN: [ 572 _Rule( 573 c( 574 rf"(.*?)((?:{block_start_re}(\-|\+|))\s*endraw\s*" 575 rf"(?:\+{block_end_re}|\-{block_end_re}\s*" 576 rf"|{block_end_re}{block_suffix_re}))" 577 ), 578 OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END), # type: ignore 579 "#pop", 580 ), 581 _Rule(c(r"(.)"), (Failure("Missing end of raw directive"),), None), 582 ], 583 # line statements 584 TOKEN_LINESTATEMENT_BEGIN: [ 585 _Rule(c(r"\s*(\n|$)"), TOKEN_LINESTATEMENT_END, "#pop") 586 ] 587 + tag_rules, 588 # line comments 589 TOKEN_LINECOMMENT_BEGIN: [ 590 _Rule( 591 c(r"(.*?)()(?=\n|$)"), 592 (TOKEN_LINECOMMENT, TOKEN_LINECOMMENT_END), 593 "#pop", 594 ) 595 ], 596 } 597 598 def _normalize_newlines(self, value: str) -> str: 599 """Replace all newlines with the configured sequence in strings 600 and template data. 601 """ 602 return newline_re.sub(self.newline_sequence, value) 603 604 def tokenize( 605 self, 606 source: str, 607 name: t.Optional[str] = None, 608 filename: t.Optional[str] = None, 609 state: t.Optional[str] = None, 610 ) -> TokenStream: 611 """Calls tokeniter + tokenize and wraps it in a token stream.""" 612 stream = self.tokeniter(source, name, filename, state) 613 return TokenStream(self.wrap(stream, name, filename), name, filename) 614 615 def wrap( 616 self, 617 stream: t.Iterable[t.Tuple[int, str, str]], 618 name: t.Optional[str] = None, 619 filename: t.Optional[str] = None, 620 ) -> t.Iterator[Token]: 621 """This is called with the stream as returned by `tokenize` and wraps 622 every token in a :class:`Token` and converts the value. 
623 """ 624 for lineno, token, value_str in stream: 625 if token in ignored_tokens: 626 continue 627 628 value: t.Any = value_str 629 630 if token == TOKEN_LINESTATEMENT_BEGIN: 631 token = TOKEN_BLOCK_BEGIN 632 elif token == TOKEN_LINESTATEMENT_END: 633 token = TOKEN_BLOCK_END 634 # we are not interested in those tokens in the parser 635 elif token in (TOKEN_RAW_BEGIN, TOKEN_RAW_END): 636 continue 637 elif token == TOKEN_DATA: 638 value = self._normalize_newlines(value_str) 639 elif token == "keyword": 640 token = value_str 641 elif token == TOKEN_NAME: 642 value = value_str 643 644 if not value.isidentifier(): 645 raise TemplateSyntaxError( 646 "Invalid character in identifier", lineno, name, filename 647 ) 648 elif token == TOKEN_STRING: 649 # try to unescape string 650 try: 651 value = ( 652 self._normalize_newlines(value_str[1:-1]) 653 .encode("ascii", "backslashreplace") 654 .decode("unicode-escape") 655 ) 656 except Exception as e: 657 msg = str(e).split(":")[-1].strip() 658 raise TemplateSyntaxError(msg, lineno, name, filename) from e 659 elif token == TOKEN_INTEGER: 660 value = int(value_str.replace("_", ""), 0) 661 elif token == TOKEN_FLOAT: 662 # remove all "_" first to support more Python versions 663 value = literal_eval(value_str.replace("_", "")) 664 elif token == TOKEN_OPERATOR: 665 token = operators[value_str] 666 667 yield Token(lineno, token, value) 668 669 def tokeniter( 670 self, 671 source: str, 672 name: t.Optional[str], 673 filename: t.Optional[str] = None, 674 state: t.Optional[str] = None, 675 ) -> t.Iterator[t.Tuple[int, str, str]]: 676 """This method tokenizes the text and returns the tokens in a 677 generator. Use this method if you just want to tokenize a template. 678 679 .. versionchanged:: 3.0 680 Only ``\\n``, ``\\r\\n`` and ``\\r`` are treated as line 681 breaks. 682 """ 683 lines = newline_re.split(source)[::2] 684 685 if not self.keep_trailing_newline and lines[-1] == "": 686 del lines[-1] 687 688 source = "\n".join(lines) 689 pos = 0 690 lineno = 1 691 stack = ["root"] 692 693 if state is not None and state != "root": 694 assert state in ("variable", "block"), "invalid state" 695 stack.append(state + "_begin") 696 697 statetokens = self.rules[stack[-1]] 698 source_length = len(source) 699 balancing_stack: t.List[str] = [] 700 newlines_stripped = 0 701 line_starting = True 702 703 while True: 704 # tokenizer loop 705 for regex, tokens, new_state in statetokens: 706 m = regex.match(source, pos) 707 708 # if no match we try again with the next rule 709 if m is None: 710 continue 711 712 # we only match blocks and variables if braces / parentheses 713 # are balanced. continue parsing with the lower rule which 714 # is the operator rule. do this only if the end tags look 715 # like operators 716 if balancing_stack and tokens in ( 717 TOKEN_VARIABLE_END, 718 TOKEN_BLOCK_END, 719 TOKEN_LINESTATEMENT_END, 720 ): 721 continue 722 723 # tuples support more options 724 if isinstance(tokens, tuple): 725 groups: t.Sequence[str] = m.groups() 726 727 if isinstance(tokens, OptionalLStrip): 728 # Rule supports lstrip. Match will look like 729 # text, block type, whitespace control, type, control, ... 730 text = groups[0] 731 # Skipping the text and first type, every other group is the 732 # whitespace control for each type. One of the groups will be 733 # -, +, or empty string instead of None. 734 strip_sign = next(g for g in groups[2::2] if g is not None) 735 736 if strip_sign == "-": 737 # Strip all whitespace between the text and the tag. 
738 stripped = text.rstrip() 739 newlines_stripped = text[len(stripped) :].count("\n") 740 groups = [stripped, *groups[1:]] 741 elif ( 742 # Not marked for preserving whitespace. 743 strip_sign != "+" 744 # lstrip is enabled. 745 and self.lstrip_blocks 746 # Not a variable expression. 747 and not m.groupdict().get(TOKEN_VARIABLE_BEGIN) 748 ): 749 # The start of text between the last newline and the tag. 750 l_pos = text.rfind("\n") + 1 751 752 if l_pos > 0 or line_starting: 753 # If there's only whitespace between the newline and the 754 # tag, strip it. 755 if whitespace_re.fullmatch(text, l_pos): 756 groups = [text[:l_pos], *groups[1:]] 757 758 for idx, token in enumerate(tokens): 759 # failure group 760 if isinstance(token, Failure): 761 raise token(lineno, filename) 762 # bygroup is a bit more complex, in that case we 763 # yield for the current token the first named 764 # group that matched 765 elif token == "#bygroup": 766 for key, value in m.groupdict().items(): 767 if value is not None: 768 yield lineno, key, value 769 lineno += value.count("\n") 770 break 771 else: 772 raise RuntimeError( 773 f"{regex!r} wanted to resolve the token dynamically" 774 " but no group matched" 775 ) 776 # normal group 777 else: 778 data = groups[idx] 779 780 if data or token not in ignore_if_empty: 781 yield lineno, token, data # type: ignore[misc] 782 783 lineno += data.count("\n") + newlines_stripped 784 newlines_stripped = 0 785 786 # strings as token just are yielded as it. 787 else: 788 data = m.group() 789 790 # update brace/parentheses balance 791 if tokens == TOKEN_OPERATOR: 792 if data == "{": 793 balancing_stack.append("}") 794 elif data == "(": 795 balancing_stack.append(")") 796 elif data == "[": 797 balancing_stack.append("]") 798 elif data in ("}", ")", "]"): 799 if not balancing_stack: 800 raise TemplateSyntaxError( 801 f"unexpected '{data}'", lineno, name, filename 802 ) 803 804 expected_op = balancing_stack.pop() 805 806 if expected_op != data: 807 raise TemplateSyntaxError( 808 f"unexpected '{data}', expected '{expected_op}'", 809 lineno, 810 name, 811 filename, 812 ) 813 814 # yield items 815 if data or tokens not in ignore_if_empty: 816 yield lineno, tokens, data 817 818 lineno += data.count("\n") 819 820 line_starting = m.group()[-1:] == "\n" 821 # fetch new position into new variable so that we can check 822 # if there is a internal parsing error which would result 823 # in an infinite loop 824 pos2 = m.end() 825 826 # handle state changes 827 if new_state is not None: 828 # remove the uppermost state 829 if new_state == "#pop": 830 stack.pop() 831 # resolve the new state by group checking 832 elif new_state == "#bygroup": 833 for key, value in m.groupdict().items(): 834 if value is not None: 835 stack.append(key) 836 break 837 else: 838 raise RuntimeError( 839 f"{regex!r} wanted to resolve the new state dynamically" 840 f" but no group matched" 841 ) 842 # direct state name given 843 else: 844 stack.append(new_state) 845 846 statetokens = self.rules[stack[-1]] 847 # we are still at the same position and no stack change. 
848 # this means a loop without break condition, avoid that and 849 # raise error 850 elif pos2 == pos: 851 raise RuntimeError( 852 f"{regex!r} yielded empty string without stack change" 853 ) 854 855 # publish new function and start again 856 pos = pos2 857 break 858 # if loop terminated without break we haven't found a single match 859 # either we are at the end of the file or we have a problem 860 else: 861 # end of text 862 if pos >= source_length: 863 return 864 865 # something went wrong 866 raise TemplateSyntaxError( 867 f"unexpected char {source[pos]!r} at {pos}", lineno, name, filename 868 )
def describe_token(token: "Token") -> str:
    """Returns a description of the token."""
    if token.type == TOKEN_NAME:
        return token.value

    return _describe_token_type(token.type)
Returns a description of the token.
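A short sketch of how name tokens differ from other token types (using only names defined in this module):

    from jinja2.lexer import TOKEN_ADD, TOKEN_BLOCK_END, TOKEN_NAME, Token, describe_token

    print(describe_token(Token(1, TOKEN_NAME, "item")))     # item  (a name token is described by its value)
    print(describe_token(Token(1, TOKEN_BLOCK_END, "%}")))  # end of statement block
    print(describe_token(Token(1, TOKEN_ADD, "+")))         # +  (operators map back to their symbol)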
def describe_token_expr(expr: str) -> str:
    """Like `describe_token` but for token expressions."""
    if ":" in expr:
        type, value = expr.split(":", 1)

        if type == TOKEN_NAME:
            return value
    else:
        type = expr

    return _describe_token_type(type)
Like describe_token but for token expressions.
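For example:

    from jinja2.lexer import describe_token_expr

    print(describe_token_expr("name:endfor"))   # endfor
    print(describe_token_expr("block_end"))     # end of statement block
    print(describe_token_expr("eq"))            # ==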
def count_newlines(value: str) -> int:
    """Count the number of newline characters in the string. This is
    useful for extensions that filter a stream.
    """
    return len(newline_re.findall(value))
Count the number of newline characters in the string. This is useful for extensions that filter a stream.
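For example:

    from jinja2.lexer import count_newlines

    # \r\n, \r and \n each count as a single newline.
    print(count_newlines("a\r\nb\rc\nd"))   # 3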
214def compile_rules(environment: "Environment") -> t.List[t.Tuple[str, str]]: 215 """Compiles all the rules from the environment into a list of rules.""" 216 e = re.escape 217 rules = [ 218 ( 219 len(environment.comment_start_string), 220 TOKEN_COMMENT_BEGIN, 221 e(environment.comment_start_string), 222 ), 223 ( 224 len(environment.block_start_string), 225 TOKEN_BLOCK_BEGIN, 226 e(environment.block_start_string), 227 ), 228 ( 229 len(environment.variable_start_string), 230 TOKEN_VARIABLE_BEGIN, 231 e(environment.variable_start_string), 232 ), 233 ] 234 235 if environment.line_statement_prefix is not None: 236 rules.append( 237 ( 238 len(environment.line_statement_prefix), 239 TOKEN_LINESTATEMENT_BEGIN, 240 r"^[ \t\v]*" + e(environment.line_statement_prefix), 241 ) 242 ) 243 if environment.line_comment_prefix is not None: 244 rules.append( 245 ( 246 len(environment.line_comment_prefix), 247 TOKEN_LINECOMMENT_BEGIN, 248 r"(?:^|(?<=\S))[^\S\r\n]*" + e(environment.line_comment_prefix), 249 ) 250 ) 251 252 return [x[1:] for x in sorted(rules, reverse=True)]
Compiles all the rules from the environment into a list of rules.
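A quick way to inspect the compiled begin-marker rules for a default environment; the exact escaped patterns depend on re.escape, so no output is asserted here:

    from jinja2 import Environment
    from jinja2.lexer import compile_rules

    # Yields (token_type, escaped_prefix_regex) pairs, longest prefixes first,
    # covering the comment, block and variable start markers (plus line
    # statement / line comment prefixes when those are configured).
    for token_type, pattern in compile_rules(Environment()):
        print(token_type, pattern)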
class Failure:
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(
        self, message: str, cls: t.Type[TemplateSyntaxError] = TemplateSyntaxError
    ) -> None:
        self.message = message
        self.error_class = cls

    def __call__(self, lineno: int, filename: t.Optional[str]) -> "te.NoReturn":
        raise self.error_class(self.message, lineno, filename)
Class that raises a TemplateSyntaxError if called. Used by the Lexer to specify known errors.
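A Failure instance defers raising until the lexer calls it with position information; a minimal sketch:

    from jinja2.exceptions import TemplateSyntaxError
    from jinja2.lexer import Failure

    fail = Failure("Missing end of comment tag")
    try:
        fail(lineno=3, filename="example.html")
    except TemplateSyntaxError as exc:
        print(exc.lineno, exc.message)   # 3 Missing end of comment tag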
270class Token(t.NamedTuple): 271 lineno: int 272 type: str 273 value: str 274 275 def __str__(self) -> str: 276 return describe_token(self) 277 278 def test(self, expr: str) -> bool: 279 """Test a token against a token expression. This can either be a 280 token type or ``'token_type:token_value'``. This can only test 281 against string values and types. 282 """ 283 # here we do a regular string equality check as test_any is usually 284 # passed an iterable of not interned strings. 285 if self.type == expr: 286 return True 287 288 if ":" in expr: 289 return expr.split(":", 1) == [self.type, self.value] 290 291 return False 292 293 def test_any(self, *iterable: str) -> bool: 294 """Test against multiple token expressions.""" 295 return any(self.test(expr) for expr in iterable)
Token(lineno, type, value)
    def test(self, expr: str) -> bool:
        """Test a token against a token expression. This can either be a
        token type or ``'token_type:token_value'``. This can only test
        against string values and types.
        """
        # here we do a regular string equality check as test_any is usually
        # passed an iterable of not interned strings.
        if self.type == expr:
            return True

        if ":" in expr:
            return expr.split(":", 1) == [self.type, self.value]

        return False
Test a token against a token expression. This can either be a token type or 'token_type:token_value'. This can only test against string values and types.
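For example:

    from jinja2.lexer import TOKEN_NAME, Token

    tok = Token(1, TOKEN_NAME, "endfor")
    print(tok.test("name"))           # True   (type only)
    print(tok.test("name:endfor"))    # True   (type and value)
    print(tok.test("name:endif"))     # False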
    def test_any(self, *iterable: str) -> bool:
        """Test against multiple token expressions."""
        return any(self.test(expr) for expr in iterable)
Test against multiple token expressions.
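test_any simply applies test to each expression:

    from jinja2.lexer import TOKEN_NAME, Token

    tok = Token(1, TOKEN_NAME, "endfor")
    print(tok.test_any("name:endif", "name:endfor"))   # True
    print(tok.test_any("integer", "float"))            # False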
class TokenStreamIterator:
    """The iterator for tokenstreams. Iterate over the stream
    until the eof token is reached.
    """

    def __init__(self, stream: "TokenStream") -> None:
        self.stream = stream

    def __iter__(self) -> "TokenStreamIterator":
        return self

    def __next__(self) -> Token:
        token = self.stream.current

        if token.type is TOKEN_EOF:
            self.stream.close()
            raise StopIteration

        next(self.stream)
        return token
The iterator for tokenstreams. Iterate over the stream until the eof token is reached.
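The iterator is rarely used directly; it is what makes a TokenStream iterable. A minimal sketch:

    from jinja2 import Environment
    from jinja2.lexer import get_lexer

    stream = get_lexer(Environment()).tokenize("{{ a + b }}")
    # Iteration goes through TokenStreamIterator and stops at the eof token.
    print([token.type for token in stream])
    # ['variable_begin', 'name', 'add', 'name', 'variable_end']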
320class TokenStream: 321 """A token stream is an iterable that yields :class:`Token`\\s. The 322 parser however does not iterate over it but calls :meth:`next` to go 323 one token ahead. The current active token is stored as :attr:`current`. 324 """ 325 326 def __init__( 327 self, 328 generator: t.Iterable[Token], 329 name: t.Optional[str], 330 filename: t.Optional[str], 331 ): 332 self._iter = iter(generator) 333 self._pushed: te.Deque[Token] = deque() 334 self.name = name 335 self.filename = filename 336 self.closed = False 337 self.current = Token(1, TOKEN_INITIAL, "") 338 next(self) 339 340 def __iter__(self) -> TokenStreamIterator: 341 return TokenStreamIterator(self) 342 343 def __bool__(self) -> bool: 344 return bool(self._pushed) or self.current.type is not TOKEN_EOF 345 346 @property 347 def eos(self) -> bool: 348 """Are we at the end of the stream?""" 349 return not self 350 351 def push(self, token: Token) -> None: 352 """Push a token back to the stream.""" 353 self._pushed.append(token) 354 355 def look(self) -> Token: 356 """Look at the next token.""" 357 old_token = next(self) 358 result = self.current 359 self.push(result) 360 self.current = old_token 361 return result 362 363 def skip(self, n: int = 1) -> None: 364 """Got n tokens ahead.""" 365 for _ in range(n): 366 next(self) 367 368 def next_if(self, expr: str) -> t.Optional[Token]: 369 """Perform the token test and return the token if it matched. 370 Otherwise the return value is `None`. 371 """ 372 if self.current.test(expr): 373 return next(self) 374 375 return None 376 377 def skip_if(self, expr: str) -> bool: 378 """Like :meth:`next_if` but only returns `True` or `False`.""" 379 return self.next_if(expr) is not None 380 381 def __next__(self) -> Token: 382 """Go one token ahead and return the old one. 383 384 Use the built-in :func:`next` instead of calling this directly. 385 """ 386 rv = self.current 387 388 if self._pushed: 389 self.current = self._pushed.popleft() 390 elif self.current.type is not TOKEN_EOF: 391 try: 392 self.current = next(self._iter) 393 except StopIteration: 394 self.close() 395 396 return rv 397 398 def close(self) -> None: 399 """Close the stream.""" 400 self.current = Token(self.current.lineno, TOKEN_EOF, "") 401 self._iter = iter(()) 402 self.closed = True 403 404 def expect(self, expr: str) -> Token: 405 """Expect a given token type and return it. This accepts the same 406 argument as :meth:`jinja2.lexer.Token.test`. 407 """ 408 if not self.current.test(expr): 409 expr = describe_token_expr(expr) 410 411 if self.current.type is TOKEN_EOF: 412 raise TemplateSyntaxError( 413 f"unexpected end of template, expected {expr!r}.", 414 self.current.lineno, 415 self.name, 416 self.filename, 417 ) 418 419 raise TemplateSyntaxError( 420 f"expected token {expr!r}, got {describe_token(self.current)!r}", 421 self.current.lineno, 422 self.name, 423 self.filename, 424 ) 425 426 return next(self)
A token stream is an iterable that yields Tokens. The parser, however, does not iterate over it but calls next() to go one token ahead. The current active token is stored as current.
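A minimal sketch of driving a stream by hand:

    from jinja2 import Environment
    from jinja2.lexer import get_lexer

    stream = get_lexer(Environment()).tokenize("{% if x %}hi{% endif %}")
    print(stream.current.type)   # block_begin
    token = next(stream)         # returns the old current token and advances
    print(token.type)            # block_begin
    print(stream.current.type)   # name  (the "if" keyword)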
    def __init__(
        self,
        generator: t.Iterable[Token],
        name: t.Optional[str],
        filename: t.Optional[str],
    ):
        self._iter = iter(generator)
        self._pushed: te.Deque[Token] = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, TOKEN_INITIAL, "")
        next(self)
    @property
    def eos(self) -> bool:
        """Are we at the end of the stream?"""
        return not self
Are we at the end of the stream?
    def push(self, token: Token) -> None:
        """Push a token back to the stream."""
        self._pushed.append(token)
Push a token back to the stream.
    def look(self) -> Token:
        """Look at the next token."""
        old_token = next(self)
        result = self.current
        self.push(result)
        self.current = old_token
        return result
Look at the next token.
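look() peeks at the following token without consuming it; for example:

    from jinja2 import Environment
    from jinja2.lexer import get_lexer

    stream = get_lexer(Environment()).tokenize("{{ x }}")
    print(stream.look().type)     # name  (the token after the current one)
    print(stream.current.type)    # variable_begin  (position unchanged)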
    def skip(self, n: int = 1) -> None:
        """Go n tokens ahead."""
        for _ in range(n):
            next(self)
Go n tokens ahead.
    def next_if(self, expr: str) -> t.Optional[Token]:
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return next(self)

        return None
Perform the token test and return the token if it matched. Otherwise the return value is None.
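For example:

    from jinja2 import Environment
    from jinja2.lexer import get_lexer

    stream = get_lexer(Environment()).tokenize("{{ x }}")
    tok = stream.next_if("variable_begin")
    print(tok.type, tok.value)          # variable_begin {{
    print(stream.next_if("integer"))    # None  (current token is a name, nothing consumed)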
    def skip_if(self, expr: str) -> bool:
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None
Like next_if() but only returns True or False.
    def close(self) -> None:
        """Close the stream."""
        self.current = Token(self.current.lineno, TOKEN_EOF, "")
        self._iter = iter(())
        self.closed = True
Close the stream.
    def expect(self, expr: str) -> Token:
        """Expect a given token type and return it. This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            expr = describe_token_expr(expr)

            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError(
                    f"unexpected end of template, expected {expr!r}.",
                    self.current.lineno,
                    self.name,
                    self.filename,
                )

            raise TemplateSyntaxError(
                f"expected token {expr!r}, got {describe_token(self.current)!r}",
                self.current.lineno,
                self.name,
                self.filename,
            )

        return next(self)
Expect a given token type and return it. This accepts the same argument as jinja2.lexer.Token.test().
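For example:

    from jinja2 import Environment
    from jinja2.exceptions import TemplateSyntaxError
    from jinja2.lexer import get_lexer

    stream = get_lexer(Environment()).tokenize("{{ x }}")
    stream.expect("variable_begin")     # matches, consumes the token
    try:
        stream.expect("integer")        # current token is the name 'x'
    except TemplateSyntaxError as exc:
        print(exc.message)              # expected token 'integer', got 'x'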
def get_lexer(environment: "Environment") -> "Lexer":
    """Return a lexer which is probably cached."""
    key = (
        environment.block_start_string,
        environment.block_end_string,
        environment.variable_start_string,
        environment.variable_end_string,
        environment.comment_start_string,
        environment.comment_end_string,
        environment.line_statement_prefix,
        environment.line_comment_prefix,
        environment.trim_blocks,
        environment.lstrip_blocks,
        environment.newline_sequence,
        environment.keep_trailing_newline,
    )
    lexer = _lexer_cache.get(key)

    if lexer is None:
        _lexer_cache[key] = lexer = Lexer(environment)

    return lexer
Return a lexer which is probably cached.
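Environments whose lexer-relevant settings match share one cached Lexer instance; a sketch:

    from jinja2 import Environment
    from jinja2.lexer import get_lexer

    a, b = Environment(), Environment()
    print(get_lexer(a) is get_lexer(b))        # True, same cache key
    custom = Environment(block_start_string="<%", block_end_string="%>")
    print(get_lexer(custom) is get_lexer(a))   # False, different delimiters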
class OptionalLStrip(tuple):  # type: ignore[type-arg]
    """A special tuple for marking a point in the state that can have
    lstrip applied.
    """

    __slots__ = ()

    # Even though it looks like a no-op, creating instances fails
    # without this.
    def __new__(cls, *members, **kwargs):  # type: ignore
        return super().__new__(cls, members)
A special tuple for marking a point in the state that can have lstrip applied.
472class Lexer: 473 """Class that implements a lexer for a given environment. Automatically 474 created by the environment class, usually you don't have to do that. 475 476 Note that the lexer is not automatically bound to an environment. 477 Multiple environments can share the same lexer. 478 """ 479 480 def __init__(self, environment: "Environment") -> None: 481 # shortcuts 482 e = re.escape 483 484 def c(x: str) -> t.Pattern[str]: 485 return re.compile(x, re.M | re.S) 486 487 # lexing rules for tags 488 tag_rules: t.List[_Rule] = [ 489 _Rule(whitespace_re, TOKEN_WHITESPACE, None), 490 _Rule(float_re, TOKEN_FLOAT, None), 491 _Rule(integer_re, TOKEN_INTEGER, None), 492 _Rule(name_re, TOKEN_NAME, None), 493 _Rule(string_re, TOKEN_STRING, None), 494 _Rule(operator_re, TOKEN_OPERATOR, None), 495 ] 496 497 # assemble the root lexing rule. because "|" is ungreedy 498 # we have to sort by length so that the lexer continues working 499 # as expected when we have parsing rules like <% for block and 500 # <%= for variables. (if someone wants asp like syntax) 501 # variables are just part of the rules if variable processing 502 # is required. 503 root_tag_rules = compile_rules(environment) 504 505 block_start_re = e(environment.block_start_string) 506 block_end_re = e(environment.block_end_string) 507 comment_end_re = e(environment.comment_end_string) 508 variable_end_re = e(environment.variable_end_string) 509 510 # block suffix if trimming is enabled 511 block_suffix_re = "\\n?" if environment.trim_blocks else "" 512 513 self.lstrip_blocks = environment.lstrip_blocks 514 515 self.newline_sequence = environment.newline_sequence 516 self.keep_trailing_newline = environment.keep_trailing_newline 517 518 root_raw_re = ( 519 rf"(?P<raw_begin>{block_start_re}(\-|\+|)\s*raw\s*" 520 rf"(?:\-{block_end_re}\s*|{block_end_re}))" 521 ) 522 root_parts_re = "|".join( 523 [root_raw_re] + [rf"(?P<{n}>{r}(\-|\+|))" for n, r in root_tag_rules] 524 ) 525 526 # global lexing rules 527 self.rules: t.Dict[str, t.List[_Rule]] = { 528 "root": [ 529 # directives 530 _Rule( 531 c(rf"(.*?)(?:{root_parts_re})"), 532 OptionalLStrip(TOKEN_DATA, "#bygroup"), # type: ignore 533 "#bygroup", 534 ), 535 # data 536 _Rule(c(".+"), TOKEN_DATA, None), 537 ], 538 # comments 539 TOKEN_COMMENT_BEGIN: [ 540 _Rule( 541 c( 542 rf"(.*?)((?:\+{comment_end_re}|\-{comment_end_re}\s*" 543 rf"|{comment_end_re}{block_suffix_re}))" 544 ), 545 (TOKEN_COMMENT, TOKEN_COMMENT_END), 546 "#pop", 547 ), 548 _Rule(c(r"(.)"), (Failure("Missing end of comment tag"),), None), 549 ], 550 # blocks 551 TOKEN_BLOCK_BEGIN: [ 552 _Rule( 553 c( 554 rf"(?:\+{block_end_re}|\-{block_end_re}\s*" 555 rf"|{block_end_re}{block_suffix_re})" 556 ), 557 TOKEN_BLOCK_END, 558 "#pop", 559 ), 560 ] 561 + tag_rules, 562 # variables 563 TOKEN_VARIABLE_BEGIN: [ 564 _Rule( 565 c(rf"\-{variable_end_re}\s*|{variable_end_re}"), 566 TOKEN_VARIABLE_END, 567 "#pop", 568 ) 569 ] 570 + tag_rules, 571 # raw block 572 TOKEN_RAW_BEGIN: [ 573 _Rule( 574 c( 575 rf"(.*?)((?:{block_start_re}(\-|\+|))\s*endraw\s*" 576 rf"(?:\+{block_end_re}|\-{block_end_re}\s*" 577 rf"|{block_end_re}{block_suffix_re}))" 578 ), 579 OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END), # type: ignore 580 "#pop", 581 ), 582 _Rule(c(r"(.)"), (Failure("Missing end of raw directive"),), None), 583 ], 584 # line statements 585 TOKEN_LINESTATEMENT_BEGIN: [ 586 _Rule(c(r"\s*(\n|$)"), TOKEN_LINESTATEMENT_END, "#pop") 587 ] 588 + tag_rules, 589 # line comments 590 TOKEN_LINECOMMENT_BEGIN: [ 591 _Rule( 592 c(r"(.*?)()(?=\n|$)"), 593 
(TOKEN_LINECOMMENT, TOKEN_LINECOMMENT_END), 594 "#pop", 595 ) 596 ], 597 } 598 599 def _normalize_newlines(self, value: str) -> str: 600 """Replace all newlines with the configured sequence in strings 601 and template data. 602 """ 603 return newline_re.sub(self.newline_sequence, value) 604 605 def tokenize( 606 self, 607 source: str, 608 name: t.Optional[str] = None, 609 filename: t.Optional[str] = None, 610 state: t.Optional[str] = None, 611 ) -> TokenStream: 612 """Calls tokeniter + tokenize and wraps it in a token stream.""" 613 stream = self.tokeniter(source, name, filename, state) 614 return TokenStream(self.wrap(stream, name, filename), name, filename) 615 616 def wrap( 617 self, 618 stream: t.Iterable[t.Tuple[int, str, str]], 619 name: t.Optional[str] = None, 620 filename: t.Optional[str] = None, 621 ) -> t.Iterator[Token]: 622 """This is called with the stream as returned by `tokenize` and wraps 623 every token in a :class:`Token` and converts the value. 624 """ 625 for lineno, token, value_str in stream: 626 if token in ignored_tokens: 627 continue 628 629 value: t.Any = value_str 630 631 if token == TOKEN_LINESTATEMENT_BEGIN: 632 token = TOKEN_BLOCK_BEGIN 633 elif token == TOKEN_LINESTATEMENT_END: 634 token = TOKEN_BLOCK_END 635 # we are not interested in those tokens in the parser 636 elif token in (TOKEN_RAW_BEGIN, TOKEN_RAW_END): 637 continue 638 elif token == TOKEN_DATA: 639 value = self._normalize_newlines(value_str) 640 elif token == "keyword": 641 token = value_str 642 elif token == TOKEN_NAME: 643 value = value_str 644 645 if not value.isidentifier(): 646 raise TemplateSyntaxError( 647 "Invalid character in identifier", lineno, name, filename 648 ) 649 elif token == TOKEN_STRING: 650 # try to unescape string 651 try: 652 value = ( 653 self._normalize_newlines(value_str[1:-1]) 654 .encode("ascii", "backslashreplace") 655 .decode("unicode-escape") 656 ) 657 except Exception as e: 658 msg = str(e).split(":")[-1].strip() 659 raise TemplateSyntaxError(msg, lineno, name, filename) from e 660 elif token == TOKEN_INTEGER: 661 value = int(value_str.replace("_", ""), 0) 662 elif token == TOKEN_FLOAT: 663 # remove all "_" first to support more Python versions 664 value = literal_eval(value_str.replace("_", "")) 665 elif token == TOKEN_OPERATOR: 666 token = operators[value_str] 667 668 yield Token(lineno, token, value) 669 670 def tokeniter( 671 self, 672 source: str, 673 name: t.Optional[str], 674 filename: t.Optional[str] = None, 675 state: t.Optional[str] = None, 676 ) -> t.Iterator[t.Tuple[int, str, str]]: 677 """This method tokenizes the text and returns the tokens in a 678 generator. Use this method if you just want to tokenize a template. 679 680 .. versionchanged:: 3.0 681 Only ``\\n``, ``\\r\\n`` and ``\\r`` are treated as line 682 breaks. 
683 """ 684 lines = newline_re.split(source)[::2] 685 686 if not self.keep_trailing_newline and lines[-1] == "": 687 del lines[-1] 688 689 source = "\n".join(lines) 690 pos = 0 691 lineno = 1 692 stack = ["root"] 693 694 if state is not None and state != "root": 695 assert state in ("variable", "block"), "invalid state" 696 stack.append(state + "_begin") 697 698 statetokens = self.rules[stack[-1]] 699 source_length = len(source) 700 balancing_stack: t.List[str] = [] 701 newlines_stripped = 0 702 line_starting = True 703 704 while True: 705 # tokenizer loop 706 for regex, tokens, new_state in statetokens: 707 m = regex.match(source, pos) 708 709 # if no match we try again with the next rule 710 if m is None: 711 continue 712 713 # we only match blocks and variables if braces / parentheses 714 # are balanced. continue parsing with the lower rule which 715 # is the operator rule. do this only if the end tags look 716 # like operators 717 if balancing_stack and tokens in ( 718 TOKEN_VARIABLE_END, 719 TOKEN_BLOCK_END, 720 TOKEN_LINESTATEMENT_END, 721 ): 722 continue 723 724 # tuples support more options 725 if isinstance(tokens, tuple): 726 groups: t.Sequence[str] = m.groups() 727 728 if isinstance(tokens, OptionalLStrip): 729 # Rule supports lstrip. Match will look like 730 # text, block type, whitespace control, type, control, ... 731 text = groups[0] 732 # Skipping the text and first type, every other group is the 733 # whitespace control for each type. One of the groups will be 734 # -, +, or empty string instead of None. 735 strip_sign = next(g for g in groups[2::2] if g is not None) 736 737 if strip_sign == "-": 738 # Strip all whitespace between the text and the tag. 739 stripped = text.rstrip() 740 newlines_stripped = text[len(stripped) :].count("\n") 741 groups = [stripped, *groups[1:]] 742 elif ( 743 # Not marked for preserving whitespace. 744 strip_sign != "+" 745 # lstrip is enabled. 746 and self.lstrip_blocks 747 # Not a variable expression. 748 and not m.groupdict().get(TOKEN_VARIABLE_BEGIN) 749 ): 750 # The start of text between the last newline and the tag. 751 l_pos = text.rfind("\n") + 1 752 753 if l_pos > 0 or line_starting: 754 # If there's only whitespace between the newline and the 755 # tag, strip it. 756 if whitespace_re.fullmatch(text, l_pos): 757 groups = [text[:l_pos], *groups[1:]] 758 759 for idx, token in enumerate(tokens): 760 # failure group 761 if isinstance(token, Failure): 762 raise token(lineno, filename) 763 # bygroup is a bit more complex, in that case we 764 # yield for the current token the first named 765 # group that matched 766 elif token == "#bygroup": 767 for key, value in m.groupdict().items(): 768 if value is not None: 769 yield lineno, key, value 770 lineno += value.count("\n") 771 break 772 else: 773 raise RuntimeError( 774 f"{regex!r} wanted to resolve the token dynamically" 775 " but no group matched" 776 ) 777 # normal group 778 else: 779 data = groups[idx] 780 781 if data or token not in ignore_if_empty: 782 yield lineno, token, data # type: ignore[misc] 783 784 lineno += data.count("\n") + newlines_stripped 785 newlines_stripped = 0 786 787 # strings as token just are yielded as it. 
788 else: 789 data = m.group() 790 791 # update brace/parentheses balance 792 if tokens == TOKEN_OPERATOR: 793 if data == "{": 794 balancing_stack.append("}") 795 elif data == "(": 796 balancing_stack.append(")") 797 elif data == "[": 798 balancing_stack.append("]") 799 elif data in ("}", ")", "]"): 800 if not balancing_stack: 801 raise TemplateSyntaxError( 802 f"unexpected '{data}'", lineno, name, filename 803 ) 804 805 expected_op = balancing_stack.pop() 806 807 if expected_op != data: 808 raise TemplateSyntaxError( 809 f"unexpected '{data}', expected '{expected_op}'", 810 lineno, 811 name, 812 filename, 813 ) 814 815 # yield items 816 if data or tokens not in ignore_if_empty: 817 yield lineno, tokens, data 818 819 lineno += data.count("\n") 820 821 line_starting = m.group()[-1:] == "\n" 822 # fetch new position into new variable so that we can check 823 # if there is a internal parsing error which would result 824 # in an infinite loop 825 pos2 = m.end() 826 827 # handle state changes 828 if new_state is not None: 829 # remove the uppermost state 830 if new_state == "#pop": 831 stack.pop() 832 # resolve the new state by group checking 833 elif new_state == "#bygroup": 834 for key, value in m.groupdict().items(): 835 if value is not None: 836 stack.append(key) 837 break 838 else: 839 raise RuntimeError( 840 f"{regex!r} wanted to resolve the new state dynamically" 841 f" but no group matched" 842 ) 843 # direct state name given 844 else: 845 stack.append(new_state) 846 847 statetokens = self.rules[stack[-1]] 848 # we are still at the same position and no stack change. 849 # this means a loop without break condition, avoid that and 850 # raise error 851 elif pos2 == pos: 852 raise RuntimeError( 853 f"{regex!r} yielded empty string without stack change" 854 ) 855 856 # publish new function and start again 857 pos = pos2 858 break 859 # if loop terminated without break we haven't found a single match 860 # either we are at the end of the file or we have a problem 861 else: 862 # end of text 863 if pos >= source_length: 864 return 865 866 # something went wrong 867 raise TemplateSyntaxError( 868 f"unexpected char {source[pos]!r} at {pos}", lineno, name, filename 869 )
Class that implements a lexer for a given environment. Automatically created by the environment class; usually you don't have to create one yourself.
Note that the lexer is not automatically bound to an environment. Multiple environments can share the same lexer.
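A minimal sketch of constructing a lexer directly; it copies only the settings it needs (here trim_blocks) rather than keeping a reference to the environment:

    from jinja2 import Environment
    from jinja2.lexer import Lexer

    lexer = Lexer(Environment(trim_blocks=True))
    stream = lexer.tokenize("{% if x %}\nhi{% endif %}")
    print([t.type for t in stream])
    # ['block_begin', 'name', 'name', 'block_end', 'data',
    #  'block_begin', 'name', 'block_end']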
480 def __init__(self, environment: "Environment") -> None: 481 # shortcuts 482 e = re.escape 483 484 def c(x: str) -> t.Pattern[str]: 485 return re.compile(x, re.M | re.S) 486 487 # lexing rules for tags 488 tag_rules: t.List[_Rule] = [ 489 _Rule(whitespace_re, TOKEN_WHITESPACE, None), 490 _Rule(float_re, TOKEN_FLOAT, None), 491 _Rule(integer_re, TOKEN_INTEGER, None), 492 _Rule(name_re, TOKEN_NAME, None), 493 _Rule(string_re, TOKEN_STRING, None), 494 _Rule(operator_re, TOKEN_OPERATOR, None), 495 ] 496 497 # assemble the root lexing rule. because "|" is ungreedy 498 # we have to sort by length so that the lexer continues working 499 # as expected when we have parsing rules like <% for block and 500 # <%= for variables. (if someone wants asp like syntax) 501 # variables are just part of the rules if variable processing 502 # is required. 503 root_tag_rules = compile_rules(environment) 504 505 block_start_re = e(environment.block_start_string) 506 block_end_re = e(environment.block_end_string) 507 comment_end_re = e(environment.comment_end_string) 508 variable_end_re = e(environment.variable_end_string) 509 510 # block suffix if trimming is enabled 511 block_suffix_re = "\\n?" if environment.trim_blocks else "" 512 513 self.lstrip_blocks = environment.lstrip_blocks 514 515 self.newline_sequence = environment.newline_sequence 516 self.keep_trailing_newline = environment.keep_trailing_newline 517 518 root_raw_re = ( 519 rf"(?P<raw_begin>{block_start_re}(\-|\+|)\s*raw\s*" 520 rf"(?:\-{block_end_re}\s*|{block_end_re}))" 521 ) 522 root_parts_re = "|".join( 523 [root_raw_re] + [rf"(?P<{n}>{r}(\-|\+|))" for n, r in root_tag_rules] 524 ) 525 526 # global lexing rules 527 self.rules: t.Dict[str, t.List[_Rule]] = { 528 "root": [ 529 # directives 530 _Rule( 531 c(rf"(.*?)(?:{root_parts_re})"), 532 OptionalLStrip(TOKEN_DATA, "#bygroup"), # type: ignore 533 "#bygroup", 534 ), 535 # data 536 _Rule(c(".+"), TOKEN_DATA, None), 537 ], 538 # comments 539 TOKEN_COMMENT_BEGIN: [ 540 _Rule( 541 c( 542 rf"(.*?)((?:\+{comment_end_re}|\-{comment_end_re}\s*" 543 rf"|{comment_end_re}{block_suffix_re}))" 544 ), 545 (TOKEN_COMMENT, TOKEN_COMMENT_END), 546 "#pop", 547 ), 548 _Rule(c(r"(.)"), (Failure("Missing end of comment tag"),), None), 549 ], 550 # blocks 551 TOKEN_BLOCK_BEGIN: [ 552 _Rule( 553 c( 554 rf"(?:\+{block_end_re}|\-{block_end_re}\s*" 555 rf"|{block_end_re}{block_suffix_re})" 556 ), 557 TOKEN_BLOCK_END, 558 "#pop", 559 ), 560 ] 561 + tag_rules, 562 # variables 563 TOKEN_VARIABLE_BEGIN: [ 564 _Rule( 565 c(rf"\-{variable_end_re}\s*|{variable_end_re}"), 566 TOKEN_VARIABLE_END, 567 "#pop", 568 ) 569 ] 570 + tag_rules, 571 # raw block 572 TOKEN_RAW_BEGIN: [ 573 _Rule( 574 c( 575 rf"(.*?)((?:{block_start_re}(\-|\+|))\s*endraw\s*" 576 rf"(?:\+{block_end_re}|\-{block_end_re}\s*" 577 rf"|{block_end_re}{block_suffix_re}))" 578 ), 579 OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END), # type: ignore 580 "#pop", 581 ), 582 _Rule(c(r"(.)"), (Failure("Missing end of raw directive"),), None), 583 ], 584 # line statements 585 TOKEN_LINESTATEMENT_BEGIN: [ 586 _Rule(c(r"\s*(\n|$)"), TOKEN_LINESTATEMENT_END, "#pop") 587 ] 588 + tag_rules, 589 # line comments 590 TOKEN_LINECOMMENT_BEGIN: [ 591 _Rule( 592 c(r"(.*?)()(?=\n|$)"), 593 (TOKEN_LINECOMMENT, TOKEN_LINECOMMENT_END), 594 "#pop", 595 ) 596 ], 597 }
    def tokenize(
        self,
        source: str,
        name: t.Optional[str] = None,
        filename: t.Optional[str] = None,
        state: t.Optional[str] = None,
    ) -> TokenStream:
        """Calls tokeniter() and wrap() and wraps the result in a token stream."""
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)
Calls tokeniter() and wrap() and wraps the result in a token stream.
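The resulting TokenStream carries converted values (integers, floats, unescaped strings), unlike the raw tuples from tokeniter; for example:

    from jinja2 import Environment
    from jinja2.lexer import get_lexer

    stream = get_lexer(Environment()).tokenize("{{ 1_000 + 2.5 }}", name="calc")
    for token in stream:
        print(token.lineno, token.type, repr(token.value))
    # 1 variable_begin '{{'
    # 1 integer 1000
    # 1 add '+'
    # 1 float 2.5
    # 1 variable_end '}}'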
616 def wrap( 617 self, 618 stream: t.Iterable[t.Tuple[int, str, str]], 619 name: t.Optional[str] = None, 620 filename: t.Optional[str] = None, 621 ) -> t.Iterator[Token]: 622 """This is called with the stream as returned by `tokenize` and wraps 623 every token in a :class:`Token` and converts the value. 624 """ 625 for lineno, token, value_str in stream: 626 if token in ignored_tokens: 627 continue 628 629 value: t.Any = value_str 630 631 if token == TOKEN_LINESTATEMENT_BEGIN: 632 token = TOKEN_BLOCK_BEGIN 633 elif token == TOKEN_LINESTATEMENT_END: 634 token = TOKEN_BLOCK_END 635 # we are not interested in those tokens in the parser 636 elif token in (TOKEN_RAW_BEGIN, TOKEN_RAW_END): 637 continue 638 elif token == TOKEN_DATA: 639 value = self._normalize_newlines(value_str) 640 elif token == "keyword": 641 token = value_str 642 elif token == TOKEN_NAME: 643 value = value_str 644 645 if not value.isidentifier(): 646 raise TemplateSyntaxError( 647 "Invalid character in identifier", lineno, name, filename 648 ) 649 elif token == TOKEN_STRING: 650 # try to unescape string 651 try: 652 value = ( 653 self._normalize_newlines(value_str[1:-1]) 654 .encode("ascii", "backslashreplace") 655 .decode("unicode-escape") 656 ) 657 except Exception as e: 658 msg = str(e).split(":")[-1].strip() 659 raise TemplateSyntaxError(msg, lineno, name, filename) from e 660 elif token == TOKEN_INTEGER: 661 value = int(value_str.replace("_", ""), 0) 662 elif token == TOKEN_FLOAT: 663 # remove all "_" first to support more Python versions 664 value = literal_eval(value_str.replace("_", "")) 665 elif token == TOKEN_OPERATOR: 666 token = operators[value_str] 667 668 yield Token(lineno, token, value)
670 def tokeniter( 671 self, 672 source: str, 673 name: t.Optional[str], 674 filename: t.Optional[str] = None, 675 state: t.Optional[str] = None, 676 ) -> t.Iterator[t.Tuple[int, str, str]]: 677 """This method tokenizes the text and returns the tokens in a 678 generator. Use this method if you just want to tokenize a template. 679 680 .. versionchanged:: 3.0 681 Only ``\\n``, ``\\r\\n`` and ``\\r`` are treated as line 682 breaks. 683 """ 684 lines = newline_re.split(source)[::2] 685 686 if not self.keep_trailing_newline and lines[-1] == "": 687 del lines[-1] 688 689 source = "\n".join(lines) 690 pos = 0 691 lineno = 1 692 stack = ["root"] 693 694 if state is not None and state != "root": 695 assert state in ("variable", "block"), "invalid state" 696 stack.append(state + "_begin") 697 698 statetokens = self.rules[stack[-1]] 699 source_length = len(source) 700 balancing_stack: t.List[str] = [] 701 newlines_stripped = 0 702 line_starting = True 703 704 while True: 705 # tokenizer loop 706 for regex, tokens, new_state in statetokens: 707 m = regex.match(source, pos) 708 709 # if no match we try again with the next rule 710 if m is None: 711 continue 712 713 # we only match blocks and variables if braces / parentheses 714 # are balanced. continue parsing with the lower rule which 715 # is the operator rule. do this only if the end tags look 716 # like operators 717 if balancing_stack and tokens in ( 718 TOKEN_VARIABLE_END, 719 TOKEN_BLOCK_END, 720 TOKEN_LINESTATEMENT_END, 721 ): 722 continue 723 724 # tuples support more options 725 if isinstance(tokens, tuple): 726 groups: t.Sequence[str] = m.groups() 727 728 if isinstance(tokens, OptionalLStrip): 729 # Rule supports lstrip. Match will look like 730 # text, block type, whitespace control, type, control, ... 731 text = groups[0] 732 # Skipping the text and first type, every other group is the 733 # whitespace control for each type. One of the groups will be 734 # -, +, or empty string instead of None. 735 strip_sign = next(g for g in groups[2::2] if g is not None) 736 737 if strip_sign == "-": 738 # Strip all whitespace between the text and the tag. 739 stripped = text.rstrip() 740 newlines_stripped = text[len(stripped) :].count("\n") 741 groups = [stripped, *groups[1:]] 742 elif ( 743 # Not marked for preserving whitespace. 744 strip_sign != "+" 745 # lstrip is enabled. 746 and self.lstrip_blocks 747 # Not a variable expression. 748 and not m.groupdict().get(TOKEN_VARIABLE_BEGIN) 749 ): 750 # The start of text between the last newline and the tag. 751 l_pos = text.rfind("\n") + 1 752 753 if l_pos > 0 or line_starting: 754 # If there's only whitespace between the newline and the 755 # tag, strip it. 
756 if whitespace_re.fullmatch(text, l_pos): 757 groups = [text[:l_pos], *groups[1:]] 758 759 for idx, token in enumerate(tokens): 760 # failure group 761 if isinstance(token, Failure): 762 raise token(lineno, filename) 763 # bygroup is a bit more complex, in that case we 764 # yield for the current token the first named 765 # group that matched 766 elif token == "#bygroup": 767 for key, value in m.groupdict().items(): 768 if value is not None: 769 yield lineno, key, value 770 lineno += value.count("\n") 771 break 772 else: 773 raise RuntimeError( 774 f"{regex!r} wanted to resolve the token dynamically" 775 " but no group matched" 776 ) 777 # normal group 778 else: 779 data = groups[idx] 780 781 if data or token not in ignore_if_empty: 782 yield lineno, token, data # type: ignore[misc] 783 784 lineno += data.count("\n") + newlines_stripped 785 newlines_stripped = 0 786 787 # strings as token just are yielded as it. 788 else: 789 data = m.group() 790 791 # update brace/parentheses balance 792 if tokens == TOKEN_OPERATOR: 793 if data == "{": 794 balancing_stack.append("}") 795 elif data == "(": 796 balancing_stack.append(")") 797 elif data == "[": 798 balancing_stack.append("]") 799 elif data in ("}", ")", "]"): 800 if not balancing_stack: 801 raise TemplateSyntaxError( 802 f"unexpected '{data}'", lineno, name, filename 803 ) 804 805 expected_op = balancing_stack.pop() 806 807 if expected_op != data: 808 raise TemplateSyntaxError( 809 f"unexpected '{data}', expected '{expected_op}'", 810 lineno, 811 name, 812 filename, 813 ) 814 815 # yield items 816 if data or tokens not in ignore_if_empty: 817 yield lineno, tokens, data 818 819 lineno += data.count("\n") 820 821 line_starting = m.group()[-1:] == "\n" 822 # fetch new position into new variable so that we can check 823 # if there is a internal parsing error which would result 824 # in an infinite loop 825 pos2 = m.end() 826 827 # handle state changes 828 if new_state is not None: 829 # remove the uppermost state 830 if new_state == "#pop": 831 stack.pop() 832 # resolve the new state by group checking 833 elif new_state == "#bygroup": 834 for key, value in m.groupdict().items(): 835 if value is not None: 836 stack.append(key) 837 break 838 else: 839 raise RuntimeError( 840 f"{regex!r} wanted to resolve the new state dynamically" 841 f" but no group matched" 842 ) 843 # direct state name given 844 else: 845 stack.append(new_state) 846 847 statetokens = self.rules[stack[-1]] 848 # we are still at the same position and no stack change. 849 # this means a loop without break condition, avoid that and 850 # raise error 851 elif pos2 == pos: 852 raise RuntimeError( 853 f"{regex!r} yielded empty string without stack change" 854 ) 855 856 # publish new function and start again 857 pos = pos2 858 break 859 # if loop terminated without break we haven't found a single match 860 # either we are at the end of the file or we have a problem 861 else: 862 # end of text 863 if pos >= source_length: 864 return 865 866 # something went wrong 867 raise TemplateSyntaxError( 868 f"unexpected char {source[pos]!r} at {pos}", lineno, name, filename 869 )
This method tokenizes the text and returns the tokens in a generator. Use this method if you just want to tokenize a template.
Changed in version 3.0: Only \n, \r\n and \r are treated as line breaks.
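tokeniter yields raw (lineno, token, value) tuples, including whitespace tokens and string-valued numbers, before wrap filters and converts them; a sketch:

    from jinja2 import Environment
    from jinja2.lexer import get_lexer

    lexer = get_lexer(Environment())
    for lineno, token, value in lexer.tokeniter("{{ 42 }}", name=None):
        print(lineno, token, repr(value))
    # 1 variable_begin '{{'
    # 1 whitespace ' '
    # 1 integer '42'
    # 1 whitespace ' '
    # 1 variable_end '}}'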