changeset: 97713:48ae9d66c720 parent: 97710:d51a82f68a70 parent: 97712:1f6155ffcaf6 user: Ezio Melotti date: Sun Sep 06 21:49:48 2015 +0300 files: Misc/NEWS description: #23144: merge with 3.5. diff -r d51a82f68a70 -r 48ae9d66c720 Lib/html/parser.py --- a/Lib/html/parser.py Sun Sep 06 21:25:30 2015 +0300 +++ b/Lib/html/parser.py Sun Sep 06 21:49:48 2015 +0300 @@ -139,7 +139,15 @@ if self.convert_charrefs and not self.cdata_elem: j = rawdata.find('<', i) if j < 0: - if not end: + # if we can't find the next <, either we are at the end + # or there's more text incoming. If the latter is True, + # we can't pass the text to handle_data in case we have + # a charref cut in half at end. Try to determine if + # this is the case before proceeding by looking for an + # & near the end and see if it's followed by a space or ;. + amppos = rawdata.rfind('&', max(i, n-34)) + if (amppos >= 0 and + not re.compile(r'[\s;]').search(rawdata, amppos)): break # wait till we get all the text j = n else: diff -r d51a82f68a70 -r 48ae9d66c720 Lib/test/test_htmlparser.py --- a/Lib/test/test_htmlparser.py Sun Sep 06 21:25:30 2015 +0300 +++ b/Lib/test/test_htmlparser.py Sun Sep 06 21:49:48 2015 +0300 @@ -72,9 +72,6 @@ class EventCollectorCharrefs(EventCollector): - def get_events(self): - return self.events - def handle_charref(self, data): self.fail('This should never be called with convert_charrefs=True') @@ -633,6 +630,18 @@ ] self._run_check(html, expected) + def test_convert_charrefs_dropped_text(self): + # #23144: make sure that all the events are triggered when + # convert_charrefs is True, even if we don't call .close() + parser = EventCollector(convert_charrefs=True) + # before the fix, bar & baz was missing + parser.feed("foo <a>link</a> bar &amp; baz") + self.assertEqual( + parser.get_events(), + [('data', 'foo '), ('starttag', 'a', []), ('data', 'link'), + ('endtag', 'a'), ('data', ' bar & baz')] + ) + class AttributesTestCase(TestCaseBase): diff -r d51a82f68a70 -r 48ae9d66c720 Misc/NEWS --- 
a/Misc/NEWS Sun Sep 06 21:25:30 2015 +0300 +++ b/Misc/NEWS Sun Sep 06 21:49:48 2015 +0300 @@ -1,4 +1,4 @@ -+++++++++++ ++++++++++++ Python News +++++++++++ @@ -181,9 +181,13 @@ Library ------- +- Issue #23144: Make sure that HTMLParser.feed() returns all the data, even + when convert_charrefs is True. + - Issue #24635: Fixed a bug in typing.py where isinstance([], typing.Iterable) would return True once, then False on subsequent calls. + - Issue #24989: Fixed buffer overread in BytesIO.readline() if a position is set beyond size. Based on patch by John Leitch.