Skip to content

Commit 49df9c1

Browse files
committed
bpo-46054: Fix parsing error when parsing non-utf8 characters in source files
1 parent: 59435ee; commit: 49df9c1

File tree

3 files changed: +19 -8 lines changed

Lib/test/test_exceptions.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2387,6 +2387,18 @@ def test_encodings(self):
23872387
finally:
23882388
unlink(TESTFN)
23892389

2390+
def test_non_utf8(self):
2391+
# Check non utf-8 characters
2392+
try:
2393+
with open(TESTFN, 'bw') as testfile:
2394+
testfile.write(b'\x7fELF\x02\x01\x01\x00\x00\x00')
2395+
rc, out, err = script_helper.assert_python_failure('-Wd', '-X', 'utf8', TESTFN)
2396+
err = err.decode('utf-8').splitlines()
2397+
2398+
self.assertEqual(err[-1], "SyntaxError: invalid non-printable character U+007F")
2399+
finally:
2400+
unlink(TESTFN)
2401+
23902402
def test_attributes_new_constructor(self):
23912403
args = ("bad.py", 1, 2, "abcdefg", 1, 100)
23922404
the_exception = SyntaxError("bad bad", args)
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix parser error when parsing non-utf8 characters in source files. Patch by
2+
Pablo Galindo.

Parser/tokenizer.c

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -819,10 +819,10 @@ tok_readline_raw(struct tok_state *tok)
819819
tok_concatenate_interactive_new_line(tok, line) == -1) {
820820
return 0;
821821
}
822-
if (*tok->inp == '\0') {
822+
tok->inp = strchr(tok->inp, '\0');
823+
if (tok->inp == tok->buf) {
823824
return 0;
824825
}
825-
tok->inp = strchr(tok->inp, '\0');
826826
} while (tok->inp[-1] != '\n');
827827
return 1;
828828
}
@@ -984,12 +984,9 @@ tok_underflow_file(struct tok_state *tok) {
984984
}
985985
/* The default encoding is UTF-8, so make sure we don't have any
986986
non-UTF-8 sequences in it. */
987-
if (!tok->encoding
988-
&& (tok->decoding_state != STATE_NORMAL || tok->lineno >= 2)) {
989-
if (!ensure_utf8(tok->cur, tok)) {
990-
error_ret(tok);
991-
return 0;
992-
}
987+
if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
988+
error_ret(tok);
989+
return 0;
993990
}
994991
assert(tok->done == E_OK);
995992
return tok->done == E_OK;

0 commit comments

Comments
 (0)