42 changes: 30 additions & 12 deletions Lib/test/test_tokenize.py
@@ -1,7 +1,8 @@
from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
open as tokenize_open, Untokenizer, generate_tokens)
open as tokenize_open, Untokenizer, generate_tokens,
NEWLINE)
from io import BytesIO, StringIO
import unittest
from unittest import TestCase, mock
@@ -15,23 +16,34 @@ class TokenizeTest(TestCase):
# Tests for the tokenize module.

# The tests can be really simple. Given a small fragment of source
# code, print out a table with tokens. The ENDMARKER is omitted for
# brevity.
# code, print out a table with tokens. The ENDMARKER, ENCODING and
# final NEWLINE are omitted for brevity.

def check_tokenize(self, s, expected):
# Format the tokens in s in a table format.
# The ENDMARKER is omitted.
# The ENDMARKER and final NEWLINE are omitted.
result = []
f = BytesIO(s.encode('utf-8'))
num_lines = len(s.splitlines())
for type, token, start, end, line in tokenize(f.readline):
if type == ENDMARKER:
break
if s[-1] not in '\r\n' and type == NEWLINE and end[0] == num_lines:
Contributor: This check is quite unreadable. Not simple to make more readable though. A short comment could help. (A commented sketch of the condition follows at the end of this hunk.)

continue
type = tok_name[type]
result.append(f" {type:10} {token!r:13} {start} {end}")
self.assertEqual(result,
[" ENCODING 'utf-8' (0, 0) (0, 0)"] +
expected.rstrip().splitlines())

def test_implicit_newline(self):
# Make sure that the tokenizer puts in an implicit NEWLINE
# when the input lacks a trailing new line.
f = BytesIO("x".encode('utf-8'))
tokens = list(tokenize(f.readline))
self.assertEqual(tokens[-2].type, NEWLINE)
self.assertEqual(tokens[-1].type, ENDMARKER)

def test_basic(self):
self.check_tokenize("1 + 1", """\
NUMBER '1' (1, 0) (1, 1)
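
Regarding the readability comment earlier in this hunk: a hedged sketch of how the skip condition might be pulled out and commented. The helper name is hypothetical and not part of this patch.

```python
from tokenize import NEWLINE

def _is_implicit_trailing_newline(source, tok_type, end, num_lines):
    # The tokenizer now synthesizes a NEWLINE token when the source has
    # no trailing line break; such a token is recognizable because the
    # last source character is not '\r' or '\n' and the token ends on
    # the final line of the input.
    return (source[-1] not in '\r\n'
            and tok_type == NEWLINE
            and end[0] == num_lines)
```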
@@ -922,12 +934,15 @@ async def bar(): pass
class GenerateTokensTest(TokenizeTest):
def check_tokenize(self, s, expected):
# Format the tokens in s in a table format.
# The ENDMARKER is omitted.
# The ENDMARKER and final NEWLINE are omitted.
result = []
f = StringIO(s)
num_lines = len(s.splitlines())
for type, token, start, end, line in generate_tokens(f.readline):
if type == ENDMARKER:
break
if s[-1] not in '\r\n' and type == NEWLINE and end[0] == num_lines:
continue
Contributor: The check_tokenize() logic is repeated exactly, but it is now becoming rather involved. This should be a common function used by both classes. (A possible shared helper is sketched after this hunk.)

type = tok_name[type]
result.append(f" {type:10} {token!r:13} {start} {end}")
self.assertEqual(result, expected.rstrip().splitlines())
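
Following up on the duplication comment in this hunk, one way the logic might be shared between TokenizeTest and GenerateTokensTest. This is a sketch only; the helper name and its module-level placement are assumptions, and it relies on the names already imported at the top of the test module (tok_name, ENDMARKER, NEWLINE).

```python
def stringify_tokens_from_source(token_generator, source_string):
    # Render every token except ENDMARKER and the implicit trailing
    # NEWLINE as one table row, so both test classes can reuse it.
    result = []
    num_lines = len(source_string.splitlines())
    missing_trailing_nl = source_string[-1] not in '\r\n'
    for type, token, start, end, line in token_generator:
        if type == ENDMARKER:
            break
        # Skip the NEWLINE the tokenizer adds when the source lacks a
        # trailing line break.
        if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
            continue
        type = tok_name[type]
        result.append(f"    {type:10} {token!r:13} {start} {end}")
    return result
```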
@@ -1022,8 +1037,8 @@ def readline():
else:
return b''

# skip the initial encoding token and the end token
tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
# skip the initial encoding token and the end tokens
tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
self.assertEqual(tokens, expected_tokens,
"bytes not decoded with encoding")
@@ -1039,8 +1054,8 @@ def readline():
else:
return b''

# skip the end token
tokens = list(_tokenize(readline, encoding=None))[:-1]
# skip the end tokens
tokens = list(_tokenize(readline, encoding=None))[:-2]
expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
self.assertEqual(tokens, expected_tokens,
"string not tokenized when encoding is None")
@@ -1351,18 +1366,21 @@ def test_oneline_defs(self):

# Test that 500 consequent, one-line defs is OK
toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
# [-2] is always NEWLINE

def assertExactTypeEqual(self, opstr, *optypes):
tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
num_optypes = len(optypes)
self.assertEqual(len(tokens), 2 + num_optypes)
self.assertEqual(len(tokens), 3 + num_optypes)
self.assertEqual(tok_name[tokens[0].exact_type],
tok_name[ENCODING])
for i in range(num_optypes):
self.assertEqual(tok_name[tokens[i + 1].exact_type],
tok_name[optypes[i]])
self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
tok_name[token.NEWLINE])
self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
tok_name[token.ENDMARKER])

def test_exact_type(self):
@@ -1515,7 +1533,7 @@ def test_roundtrip(self):
self.check_roundtrip("if x == 1:\n"
" print(x)\n")
self.check_roundtrip("# This is a comment\n"
"# This also")
"# This also\n")

# Some people use different formatting conventions, which makes
# untokenize a little trickier. Note that this test involves trailing
10 changes: 10 additions & 0 deletions Lib/tokenize.py
@@ -492,8 +492,15 @@ def _tokenize(readline, encoding):
# BOM will already have been stripped.
encoding = "utf-8"
yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
last_line = b''
line = b''
while True: # loop over lines in stream
try:
# We capture the value of the line variable here because
# readline uses the empty string '' to signal end of input,
# hence `line` itself will always be overwritten at the end
# of this loop.
last_line = line
Contributor: Shouldn't last_line only be set after StopIteration is caught? ISTM that in other cases we wouldn't want to be adding the newline at the end.

Member (author): readline is one of the ancient APIs that existed before generators. There are two ways of stopping iteration: either raising StopIteration or returning the empty string; the latter gets caught all the way down at https://github.com/python/cpython/blob/master/Lib/tokenize.py#L528. What you're describing is what I had initially, but there are a few places in the loop where iteration can stop, so I thought this would be simpler. (A minimal illustration of both conventions follows this hunk.)

Contributor: Then perhaps add a comment to that end? It seems rather crucial to understanding that particular piece of the code.

Member (author): Good point, added.

line = readline()
except StopIteration:
line = b''
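
To make the end-of-input discussion above concrete, here is a small, hedged illustration (using the public tokenize() wrapper rather than the private _tokenize()) of the two ways a readline callable can signal exhaustion:

```python
from io import BytesIO
from tokenize import tokenize

source = b"x = 1\n"

# Convention 1: return an empty bytes object at end of input, as
# file-like objects such as BytesIO do.
tokens_a = list(tokenize(BytesIO(source).readline))

# Convention 2: raise StopIteration once the input is exhausted.
lines = iter([source])
def readline():
    return next(lines)  # raises StopIteration after the only line

tokens_b = list(tokenize(readline))

# Both conventions should produce the same token stream.
assert [t[:2] for t in tokens_a] == [t[:2] for t in tokens_b]
```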
@@ -648,6 +655,9 @@ def _tokenize(readline, encoding):
(lnum, pos), (lnum, pos+1), line)
pos += 1

# Add an implicit NEWLINE if the input doesn't end in one
if len(last_line) > 0 and last_line[-1] not in '\r\n':
Contributor: This should begin if last_line and .... (The equivalence is sketched after this hunk.)

yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
for indent in indents[1:]: # pop remaining indent levels
yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
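
The style suggestion above amounts to relying on the truthiness of an empty string instead of an explicit length check; a quick check of the equivalence:

```python
# For any string value of last_line, the two spellings agree:
#     len(last_line) > 0 and last_line[-1] not in '\r\n'
#     last_line and last_line[-1] not in '\r\n'
for last_line in ('', 'x', 'x\n', 'x\r'):
    explicit = len(last_line) > 0 and last_line[-1] not in '\r\n'
    truthy = bool(last_line and last_line[-1] not in '\r\n')
    assert explicit == truthy
```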
3 changes: 3 additions & 0 deletions (new NEWS entry)
@@ -0,0 +1,3 @@
Tokenize module now implicitly emits a NEWLINE when provided with input that
does not have a trailing new line. This behavior now matches what the C
tokenizer does internally.
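
A quick way to observe the behavior described in this entry (assuming an interpreter that includes the change):

```python
from io import BytesIO
from tokenize import tokenize, tok_name

# Note: the source has no trailing newline.
tokens = list(tokenize(BytesIO(b"1 + 1").readline))
print([tok_name[t.type] for t in tokens])
# With the implicit NEWLINE in place, this prints:
# ['ENCODING', 'NUMBER', 'OP', 'NUMBER', 'NEWLINE', 'ENDMARKER']
```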