 from test import support
 from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
-                     open as tokenize_open, Untokenizer)
+                     open as tokenize_open, Untokenizer, generate_tokens,
+                     NEWLINE)
 from io import BytesIO
 import unittest
 from unittest import TestCase, mock
 import token
 
 
+# Converts a source string into a list of textual representations
+# of the tokens such as:
+# `    NAME       'if'          (1, 0) (1, 2)`
+# to make writing tests easier.
+def stringify_tokens_from_source(token_generator, source_string):
+    result = []
+    num_lines = len(source_string.splitlines())
+    missing_trailing_nl = source_string[-1] not in '\r\n'
+
+    for type, token, start, end, line in token_generator:
+        if type == ENDMARKER:
+            break
+        # Ignore the new line on the last line if the input lacks one
+        if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
+            continue
+        type = tok_name[type]
+        result.append(f"    {type:10} {token!r:13} {start} {end}")
+
+    return result
+
 class TokenizeTest(TestCase):
     # Tests for the tokenize module.
 
     # The tests can be really simple. Given a small fragment of source
-    # code, print out a table with tokens. The ENDMARKER is omitted for
-    # brevity.
+    # code, print out a table with tokens. The ENDMARKER, ENCODING and
+    # final NEWLINE are omitted for brevity.
 
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
-        # The ENDMARKER is omitted.
-        result = []
+        # The ENDMARKER and final NEWLINE are omitted.
         f = BytesIO(s.encode('utf-8'))
-        for type, token, start, end, line in tokenize(f.readline):
-            if type == ENDMARKER:
-                break
-            type = tok_name[type]
-            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        result = stringify_tokens_from_source(tokenize(f.readline), s)
+
         self.assertEqual(result,
                          ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                          expected.rstrip().splitlines())
 
+    def test_implicit_newline(self):
+        # Make sure that the tokenizer puts in an implicit NEWLINE
+        # when the input lacks a trailing new line.
+        f = BytesIO("x".encode('utf-8'))
+        tokens = list(tokenize(f.readline))
+        self.assertEqual(tokens[-2].type, NEWLINE)
+        self.assertEqual(tokens[-1].type, ENDMARKER)
+
     def test_basic(self):
         self.check_tokenize("1 + 1", """\
     NUMBER     '1'           (1, 0) (1, 1)
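
The new test above exercises the core behavioural change. As a quick standalone sketch (not part of the patch, and assuming an interpreter that already includes this change), the same check looks like:

    from io import BytesIO
    from tokenize import tokenize, NEWLINE, ENDMARKER

    # "x" has no trailing newline, yet the token stream still ends
    # with an implicit NEWLINE followed by ENDMARKER.
    tokens = list(tokenize(BytesIO(b"x").readline))
    assert tokens[-2].type == NEWLINE
    assert tokens[-1].type == ENDMARKER
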
@@ -1009,8 +1034,8 @@ def readline():
             else:
                 return b''
 
-        # skip the initial encoding token and the end token
-        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
+        # skip the initial encoding token and the end tokens
+        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
@@ -1026,8 +1051,8 @@ def readline():
             else:
                 return b''
 
-        # skip the end token
-        tokens = list(_tokenize(readline, encoding=None))[:-1]
+        # skip the end tokens
+        tokens = list(_tokenize(readline, encoding=None))[:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "string not tokenized when encoding is None")
@@ -1338,18 +1363,21 @@ def test_oneline_defs(self):
 
         # Test that 500 consequent, one-line defs is OK
         toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
-        self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
+        self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
+                                                # [-2] is always NEWLINE
 
     def assertExactTypeEqual(self, opstr, *optypes):
         tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
         num_optypes = len(optypes)
-        self.assertEqual(len(tokens), 2 + num_optypes)
+        self.assertEqual(len(tokens), 3 + num_optypes)
         self.assertEqual(tok_name[tokens[0].exact_type],
                          tok_name[ENCODING])
         for i in range(num_optypes):
             self.assertEqual(tok_name[tokens[i + 1].exact_type],
                              tok_name[optypes[i]])
         self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
+                         tok_name[token.NEWLINE])
+        self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
                          tok_name[token.ENDMARKER])
 
     def test_exact_type(self):
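
A note on why the expected token counts and end-of-stream slices shift by one in the hunks above: with the implicit NEWLINE, even a source consisting of a single operator now produces ENCODING, the operator, NEWLINE and ENDMARKER. A minimal illustration (not part of the patch, same interpreter assumption as above):

    from io import BytesIO
    from tokenize import tokenize, tok_name

    # Expect 3 + num_optypes tokens for a bare operator with no newline.
    toks = list(tokenize(BytesIO(b"+").readline))
    print([tok_name[t.exact_type] for t in toks])
    # -> ['ENCODING', 'PLUS', 'NEWLINE', 'ENDMARKER']
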
@@ -1502,7 +1530,7 @@ def test_roundtrip(self):
         self.check_roundtrip("if x == 1:\n"
                              "    print(x)\n")
         self.check_roundtrip("# This is a comment\n"
-                             "# This also")
+                             "# This also\n")
 
         # Some people use different formatting conventions, which makes
         # untokenize a little trickier. Note that this test involves trailing