aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/Lib/_pyrepl/utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/_pyrepl/utils.py')
-rw-r--r--Lib/_pyrepl/utils.py290
1 files changed, 284 insertions, 6 deletions
diff --git a/Lib/_pyrepl/utils.py b/Lib/_pyrepl/utils.py
index 7437fbe1ab9..fe154aa59a0 100644
--- a/Lib/_pyrepl/utils.py
+++ b/Lib/_pyrepl/utils.py
@@ -1,6 +1,17 @@
+from __future__ import annotations
+import builtins
+import functools
+import keyword
import re
+import token as T
+import tokenize
import unicodedata
-import functools
+import _colorize
+
+from collections import deque
+from io import StringIO
+from tokenize import TokenInfo as TI
+from typing import Iterable, Iterator, Match, NamedTuple, Self
from .types import CharBuffer, CharWidths
from .trace import trace
@@ -8,6 +19,32 @@ from .trace import trace
ANSI_ESCAPE_SEQUENCE = re.compile(r"\x1b\[[ -@]*[A-~]")
ZERO_WIDTH_BRACKET = re.compile(r"\x01.*?\x02")
ZERO_WIDTH_TRANS = str.maketrans({"\x01": "", "\x02": ""})
# Keywords whose following NAME token is the identifier being defined.
IDENTIFIERS_AFTER = {"def", "class"}
# Every name exposed by the `builtins` module that does not start with "_".
BUILTINS = {
    name
    for name in dir(builtins)
    if not name.startswith("_")
}
+
+
+class Span(NamedTuple):
+ """Span indexing that's inclusive on both ends."""
+
+ start: int
+ end: int
+
+ @classmethod
+ def from_re(cls, m: Match[str], group: int | str) -> Self:
+ re_span = m.span(group)
+ return cls(re_span[0], re_span[1] - 1)
+
+ @classmethod
+ def from_token(cls, token: TI, line_len: list[int]) -> Self:
+ return cls(
+ line_len[token.start[0] - 1] + token.start[1],
+ line_len[token.end[0] - 1] + token.end[1] - 1,
+ )
+
+
class ColorSpan(NamedTuple):
    """An index span paired with the color tag to apply to it."""

    # Inclusive range of indexes to color.
    span: Span
    # Tag naming the color to use for that range.
    tag: _colorize.ColorTag
@functools.cache
@@ -41,17 +78,207 @@ def unbracket(s: str, including_content: bool = False) -> str:
return s.translate(ZERO_WIDTH_TRANS)
-def disp_str(buffer: str) -> tuple[CharBuffer, CharWidths]:
- r"""Decompose the input buffer into a printable variant.
def gen_colors(buffer: str) -> Iterator[ColorSpan]:
    """Yield the spans of `buffer` to color, each with its color tag.

    Spans are produced lazily while `buffer` is tokenized. If tokenizing
    fails with a `tokenize.TokenError`, whatever
    `recover_unterminated_string()` produces for that error is yielded last.

    The input `buffer` should be a valid start of a Python code block, i.e.
    it cannot be a block starting in the middle of a multiline string.
    """
    source = StringIO(buffer)

    # offsets[n] is the combined length of the first n lines, i.e. the
    # absolute index at which the (1-based) line n + 1 begins.
    offsets = [0]
    for line in source:
        offsets.append(offsets[-1] + len(line))

    source.seek(0)
    tokens = tokenize.generate_tokens(source.readline)
    previous: ColorSpan | None = None
    try:
        for color_span in gen_colors_from_token_stream(tokens, offsets):
            yield color_span
            previous = color_span
    except tokenize.TokenError as error:
        yield from recover_unterminated_string(
            error, offsets, previous, buffer
        )
+
+
def recover_unterminated_string(
    exc: tokenize.TokenError,
    line_lengths: list[int],
    last_emitted: ColorSpan | None,
    buffer: str,
) -> Iterator[ColorSpan]:
    """Yield a "STRING" span for a string literal left open at end of input.

    `exc.args` must be a `(message, location)` pair. Nothing is yielded when
    the location is None. When the message does not start with one of the
    known unterminated-string messages, the error is only traced.

    `line_lengths` holds cumulative line lengths (see `gen_colors()`), and
    `last_emitted` is the last span already yielded for `buffer`, if any.
    """
    msg, loc = exc.args
    if loc is None:
        return

    line_no, column = loc

    unterminated_messages = (
        "unterminated string literal",
        "unterminated f-string literal",
        "unterminated t-string literal",
        "EOF in multi-line string",
        "unterminated triple-quoted f-string literal",
        "unterminated triple-quoted t-string literal",
    )
    if not msg.startswith(unterminated_messages):
        trace(
            "unhandled token error({buffer}) = {te}",
            buffer=repr(buffer),
            te=str(exc),
        )
        return

    # The string runs from the reported location to the last index of the
    # input.
    first = line_lengths[line_no - 1] + column - 1
    last = line_lengths[-1] - 1

    # in case FSTRING_START was already emitted
    if last_emitted and first <= last_emitted.span.start:
        trace("before last emitted = {s}", s=first)
        first = last_emitted.span.end + 1

    recovered = Span(first, last)
    trace("yielding span {a} -> {b}", a=recovered.start, b=recovered.end)
    yield ColorSpan(recovered, "STRING")
+
+
+def gen_colors_from_token_stream(
+ token_generator: Iterator[TI],
+ line_lengths: list[int],
+) -> Iterator[ColorSpan]:
+ token_window = prev_next_window(token_generator)
+
+ is_def_name = False
+ bracket_level = 0
+ for prev_token, token, next_token in token_window:
+ assert token is not None
+ if token.start == token.end:
+ continue
+
+ match token.type:
+ case (
+ T.STRING
+ | T.FSTRING_START | T.FSTRING_MIDDLE | T.FSTRING_END
+ | T.TSTRING_START | T.TSTRING_MIDDLE | T.TSTRING_END
+ ):
+ span = Span.from_token(token, line_lengths)
+ yield ColorSpan(span, "STRING")
+ case T.COMMENT:
+ span = Span.from_token(token, line_lengths)
+ yield ColorSpan(span, "COMMENT")
+ case T.NUMBER:
+ span = Span.from_token(token, line_lengths)
+ yield ColorSpan(span, "NUMBER")
+ case T.OP:
+ if token.string in "([{":
+ bracket_level += 1
+ elif token.string in ")]}":
+ bracket_level -= 1
+ span = Span.from_token(token, line_lengths)
+ yield ColorSpan(span, "OP")
+ case T.NAME:
+ if is_def_name:
+ is_def_name = False
+ span = Span.from_token(token, line_lengths)
+ yield ColorSpan(span, "DEFINITION")
+ elif keyword.iskeyword(token.string):
+ span = Span.from_token(token, line_lengths)
+ yield ColorSpan(span, "KEYWORD")
+ if token.string in IDENTIFIERS_AFTER:
+ is_def_name = True
+ elif (
+ keyword.issoftkeyword(token.string)
+ and bracket_level == 0
+ and is_soft_keyword_used(prev_token, token, next_token)
+ ):
+ span = Span.from_token(token, line_lengths)
+ yield ColorSpan(span, "SOFT_KEYWORD")
+ elif token.string in BUILTINS:
+ span = Span.from_token(token, line_lengths)
+ yield ColorSpan(span, "BUILTIN")
+
+
+keyword_first_sets_match = {"False", "None", "True", "await", "lambda", "not"}
+keyword_first_sets_case = {"False", "None", "True"}
+
+
+def is_soft_keyword_used(*tokens: TI | None) -> bool:
+ """Returns True if the current token is a keyword in this context.
+
+ For the `*tokens` to match anything, they have to be a three-tuple of
+ (previous, current, next).
+ """
+ trace("is_soft_keyword_used{t}", t=tokens)
+ match tokens:
+ case (
+ None | TI(T.NEWLINE) | TI(T.INDENT) | TI(string=":"),
+ TI(string="match"),
+ TI(T.NUMBER | T.STRING | T.FSTRING_START | T.TSTRING_START)
+ | TI(T.OP, string="(" | "*" | "[" | "{" | "~" | "...")
+ ):
+ return True
+ case (
+ None | TI(T.NEWLINE) | TI(T.INDENT) | TI(string=":"),
+ TI(string="match"),
+ TI(T.NAME, string=s)
+ ):
+ if keyword.iskeyword(s):
+ return s in keyword_first_sets_match
+ return True
+ case (
+ None | TI(T.NEWLINE) | TI(T.INDENT) | TI(string=":"),
+ TI(string="case"),
+ TI(T.NUMBER | T.STRING | T.FSTRING_START | T.TSTRING_START)
+ | TI(T.OP, string="(" | "*" | "-" | "[" | "{")
+ ):
+ return True
+ case (
+ None | TI(T.NEWLINE) | TI(T.INDENT) | TI(string=":"),
+ TI(string="case"),
+ TI(T.NAME, string=s)
+ ):
+ if keyword.iskeyword(s):
+ return s in keyword_first_sets_case
+ return True
+ case (TI(string="case"), TI(string="_"), TI(string=":")):
+ return True
+ case _:
+ return False
+
+
+def disp_str(
+ buffer: str, colors: list[ColorSpan] | None = None, start_index: int = 0
+) -> tuple[CharBuffer, CharWidths]:
+ r"""Decompose the input buffer into a printable variant with applied colors.
Returns a tuple of two lists:
- - the first list is the input buffer, character by character;
+ - the first list is the input buffer, character by character, with color
+ escape codes added (while those codes contain multiple ASCII characters,
+ each code is considered atomic *and is attached to the corresponding
+ visible character*);
- the second list is the visible width of each character in the input
buffer.
+ Note on colors:
+ - The `colors` list, if provided, is partially consumed within. We're using
+ a list and not a generator since we need to hold onto the current
+ unfinished span between calls to disp_str in case of multiline strings.
+ - The `colors` list is computed from the start of the input block. `buffer`
+ is only a subset of that input block, a single line within. This is why
+ we need `start_index`: it tells us where `buffer` actually starts within
+ the user input. This allows us to match color spans correctly.
+
Examples:
>>> utils.disp_str("a = 9")
(['a', ' ', '=', ' ', '9'], [1, 1, 1, 1, 1])
+
+ >>> line = "while 1:"
+ >>> colors = list(utils.gen_colors(line))
+ >>> utils.disp_str(line, colors=colors)
+ (['\x1b[1;34mw', 'h', 'i', 'l', 'e\x1b[0m', ' ', '1', ':'], [1, 1, 1, 1, 1, 1, 1, 1])
+
"""
chars: CharBuffer = []
char_widths: CharWidths = []
@@ -59,7 +286,20 @@ def disp_str(buffer: str) -> tuple[CharBuffer, CharWidths]:
if not buffer:
return chars, char_widths
- for c in buffer:
+ while colors and colors[0].span.end < start_index:
+ # move past irrelevant spans
+ colors.pop(0)
+
+ pre_color = ""
+ post_color = ""
+ if colors and colors[0].span.start < start_index:
+ # looks like we're continuing a previous color (e.g. a multiline str)
+ pre_color = _colorize.theme[colors[0].tag]
+
+ for i, c in enumerate(buffer, start_index):
+ if colors and colors[0].span.start == i: # new color starts now
+ pre_color = _colorize.theme[colors[0].tag]
+
if c == "\x1a": # CTRL-Z on Windows
chars.append(c)
char_widths.append(2)
@@ -73,5 +313,43 @@ def disp_str(buffer: str) -> tuple[CharBuffer, CharWidths]:
else:
chars.append(c)
char_widths.append(str_width(c))
- trace("disp_str({buffer}) = {s}, {b}", buffer=repr(buffer), s=chars, b=char_widths)
+
+ if colors and colors[0].span.end == i: # current color ends now
+ post_color = _colorize.theme["RESET"]
+ colors.pop(0)
+
+ chars[-1] = pre_color + chars[-1] + post_color
+ pre_color = ""
+ post_color = ""
+
+ if colors and colors[0].span.start < i and colors[0].span.end > i:
+ # even though the current color should be continued, reset it for now.
+ # the next call to `disp_str()` will revive it.
+ chars[-1] += _colorize.theme["RESET"]
+
return chars, char_widths
+
+
+def prev_next_window[T](
+ iterable: Iterable[T]
+) -> Iterator[tuple[T | None, ...]]:
+ """Generates three-tuples of (previous, current, next) items.
+
+ On the first iteration previous is None. On the last iteration next
+ is None. In case of exception next is None and the exception is re-raised
+ on a subsequent next() call.
+
+ Inspired by `sliding_window` from `itertools` recipes.
+ """
+
+ iterator = iter(iterable)
+ window = deque((None, next(iterator)), maxlen=3)
+ try:
+ for x in iterator:
+ window.append(x)
+ yield tuple(window)
+ except Exception:
+ raise
+ finally:
+ window.append(None)
+ yield tuple(window)