| 
 | 1 | +# Copyright (c) Microsoft Corporation. All rights reserved.  | 
 | 2 | +# Licensed under the MIT License.  | 
 | 3 | + | 
 | 4 | +import ast  | 
 | 5 | +import textwrap  | 
 | 6 | +import re  | 
 | 7 | +import sys  | 
 | 8 | + | 
 | 9 | + | 
 | 10 | +def split_lines(source):  | 
 | 11 | + """  | 
 | 12 | + Split selection lines in a version-agnostic way.  | 
 | 13 | +
  | 
 | 14 | + Python grammar only treats \r, \n, and \r\n as newlines.  | 
 | 15 | + But splitlines() in Python 3 has a much larger list: for example, it also includes \v, \f.  | 
 | 16 | + As such, this function will split lines across all Python versions.  | 
 | 17 | + """  | 
 | 18 | + return re.split(r"[\n\r]+", source)  | 
 | 19 | + | 
 | 20 | + | 
 | 21 | +def _get_statements(selection):  | 
 | 22 | + """  | 
 | 23 | + Process a multiline selection into a list of its top-level statements.  | 
 | 24 | + This will remove empty newlines around and within the selection, dedent it,  | 
 | 25 | + and split it using the result of `ast.parse()`.  | 
 | 26 | + """  | 
 | 27 | + | 
 | 28 | + # Remove blank lines within the selection to prevent the REPL from thinking the block is finished.  | 
 | 29 | + lines = (line for line in split_lines(selection) if line.strip() != "")  | 
 | 30 | + | 
 | 31 | + # Dedent the selection and parse it using the ast module.  | 
 | 32 | + # Note that leading comments in the selection will be discarded during parsing.  | 
 | 33 | + source = textwrap.dedent("\n".join(lines))  | 
 | 34 | + tree = ast.parse(source)  | 
 | 35 | + | 
 | 36 | + # We'll need the dedented lines to rebuild the selection.  | 
 | 37 | + lines = split_lines(source)  | 
 | 38 | + | 
 | 39 | + # Get the line ranges for top-level blocks returned from parsing the dedented text  | 
 | 40 | + # and split the selection accordingly.  | 
 | 41 | + # tree.body is a list of AST objects, which we rely on to extract top-level statements.  | 
 | 42 | + # If we supported Python 3.8+ only we could use the lineno and end_lineno attributes of each object  | 
 | 43 | + # to get the boundaries of each block.  | 
 | 44 | + # However, earlier Python versions only have the lineno attribute, which is the range start position (1-indexed).  | 
 | 45 | + # Therefore, to retrieve the end line of each block in a version-agnostic way we need to do  | 
 | 46 | + # `end = next_block.lineno - 1`  | 
 | 47 | + # for all blocks except the last one, which will will just run until the last line.  | 
 | 48 | + ends = [node.lineno - 1 for node in tree.body[1:]] + [len(lines)]  | 
 | 49 | + for node, end in zip(tree.body, ends):  | 
 | 50 | + # Given this selection:  | 
 | 51 | + # 1: if (m > 0 and  | 
 | 52 | + # 2: n < 3):  | 
 | 53 | + # 3: print('foo')  | 
 | 54 | + # 4: value = 'bar'  | 
 | 55 | + #  | 
 | 56 | + # The first block would have lineno = 1,and the second block lineno = 4  | 
 | 57 | + start = node.lineno - 1  | 
 | 58 | + block = "\n".join(lines[start:end])  | 
 | 59 | + | 
 | 60 | + # If the block is multiline, add an extra newline character at its end.  | 
 | 61 | + # This way, when joining blocks back together, there will be a blank line between each multiline statement  | 
 | 62 | + # and no blank lines between single-line statements, or it would look like this:  | 
 | 63 | + # >>> x = 22  | 
 | 64 | + # >>>  | 
 | 65 | + # >>> total = x + 30  | 
 | 66 | + # >>>  | 
 | 67 | + # Note that for the multiline parentheses case this newline is redundant,  | 
 | 68 | + # since the closing parenthesis terminates the statement already.  | 
 | 69 | + # This means that for this pattern we'll end up with:  | 
 | 70 | + # >>> x = [  | 
 | 71 | + # ... 1  | 
 | 72 | + # ... ]  | 
 | 73 | + # >>>  | 
 | 74 | + # >>> y = [  | 
 | 75 | + # ... 2  | 
 | 76 | + # ...]  | 
 | 77 | + if end - start > 1:  | 
 | 78 | + block += "\n"  | 
 | 79 | + | 
 | 80 | + yield block  | 
 | 81 | + | 
 | 82 | + | 
 | 83 | +def normalize_lines(selection):  | 
 | 84 | + """  | 
 | 85 | + Normalize the text selection received from the extension and send it to the REPL.  | 
 | 86 | +
  | 
 | 87 | + If it is a single line selection, dedent it, append a newline and send it to the REPL.  | 
 | 88 | + Otherwise, sanitize the multiline selection before sending it to the REPL:  | 
 | 89 | + split it in a list of top-level statements  | 
 | 90 | + and add newlines between each of them to tell the REPL where each block ends.  | 
 | 91 | + """  | 
 | 92 | + | 
 | 93 | + try:  | 
 | 94 | + # Parse the selection into a list of top-level blocks.  | 
 | 95 | + # We don't differentiate between single and multiline statements  | 
 | 96 | + # because it's not a perf bottleneck,  | 
 | 97 | + # and the overhead from splitting and rejoining strings in the multiline case is one-off.  | 
 | 98 | + statements = _get_statements(selection)  | 
 | 99 | + | 
 | 100 | + # Insert a newline between each top-level statement, and append a newline to the selection.  | 
 | 101 | + source = "\n".join(statements) + "\n"  | 
 | 102 | + except:  | 
 | 103 | + # If there's a problem when parsing statements,  | 
 | 104 | + # append a blank line to end the block and send it as-is.  | 
 | 105 | + source = selection + "\n\n"  | 
 | 106 | + | 
 | 107 | + # `source` is a unicode instance at this point on Python 2,  | 
 | 108 | + # so if we used `sys.stdout.write` to send it to the REPL,  | 
 | 109 | + # Python will implicitly encode it using sys.getdefaultencoding(),  | 
 | 110 | + # which we don't want.  | 
 | 111 | + stdout = sys.stdout if sys.version_info < (3,) else sys.stdout.buffer  | 
 | 112 | + stdout.write(source.encode("utf-8"))  | 
 | 113 | + stdout.flush()  | 
 | 114 | + | 
 | 115 | + | 
 | 116 | +if __name__ == "__main__":  | 
 | 117 | + # This will fail on a large file.  | 
 | 118 | + # See https://github.com/microsoft/vscode-python/issues/14471  | 
 | 119 | + contents = sys.argv[1]  | 
 | 120 | + try:  | 
 | 121 | + default_encoding = sys.getdefaultencoding()  | 
 | 122 | + encoded_contents = contents.encode(default_encoding, "surrogateescape")  | 
 | 123 | + contents = encoded_contents.decode(default_encoding, "replace")  | 
 | 124 | + except (UnicodeError, LookupError):  | 
 | 125 | + pass  | 
 | 126 | + if isinstance(contents, bytes):  | 
 | 127 | + contents = contents.decode("utf8")  | 
 | 128 | + normalize_lines(contents)  | 
0 commit comments