Skip to content

Commit 78e3e52

Browse files
committed
Newer version of bs3
1 parent 427abbc commit 78e3e52

File tree

1 file changed

+31
-28
lines changed

1 file changed

+31
-28
lines changed

thirdparty/beautifulsoup/beautifulsoup.py

Lines changed: 31 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,8 @@
7979
from __future__ import generators
8080

8181
__author__ = "Leonard Richardson (leonardr@segfault.org)"
82-
__version__ = "3.2.0"
83-
__copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
82+
__version__ = "3.2.1"
83+
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
8484
__license__ = "New-style BSD"
8585

8686
from sgmllib import SGMLParser, SGMLParseError
@@ -114,6 +114,21 @@ class PageElement(object):
114114
"""Contains the navigational information for some part of the page
115115
(either a tag or a piece of text)"""
116116

117+
def _invert(h):
    "Return a new hash that maps each value of h back to its key."
    # Pairs are swapped in h.items() order, so when two keys share a
    # value the later pair wins, exactly as with item-by-item assignment.
    return dict([(value, key) for key, value in h.items()])
123+
124+
# Maps XML entity names to the special characters they represent.
XML_ENTITIES_TO_SPECIAL_CHARS = {
    "apos": "'",
    "quot": '"',
    "amp": "&",
    "lt": "<",
    "gt": ">",
}
129+
130+
# Reverse lookup (special character -> entity name), built by swapping the
# keys and values of XML_ENTITIES_TO_SPECIAL_CHARS.
XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
131+
117132
def setup(self, parent=None, previous=None):
118133
"""Sets up the initial relations between this element and
119134
other elements."""
@@ -421,6 +436,16 @@ def toEncoding(self, s, encoding=None):
421436
s = unicode(s)
422437
return s
423438

439+
# Matches a literal "<" or ">", or an "&" that is not already the start of
# a decimal (&#38;), hexadecimal (&#x26;) or named (&amp;) entity reference.
# Written as a raw string so that \d and \w reach the regex engine as-is
# rather than relying on Python passing unknown string escapes through
# unchanged (which newer interpreters warn about).
BARE_AMPERSAND_OR_BRACKET = re.compile(
    r"([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))")
442+
443+
def _sub_entity(self, x):
    """Substitution callback for a regular expression: returns the
    XML entity reference for the special character that begins the
    matched text of x."""
    special_char = x.group(0)[0]
    entity_name = self.XML_SPECIAL_CHARS_TO_ENTITIES[special_char]
    return "".join(("&", entity_name, ";"))
447+
448+
424449
class NavigableString(unicode, PageElement):
425450

426451
def __new__(cls, value):
@@ -451,10 +476,12 @@ def __unicode__(self):
451476
return str(self).decode(DEFAULT_OUTPUT_ENCODING)
452477

453478
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
479+
# Substitute outgoing XML entities.
480+
data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self)
454481
if encoding:
455-
return self.encode(encoding)
482+
return data.encode(encoding)
456483
else:
457-
return self
484+
return data
458485

459486
class CData(NavigableString):
460487

@@ -480,21 +507,6 @@ class Tag(PageElement):
480507

481508
"""Represents a found HTML tag with its attributes and contents."""
482509

483-
def _invert(h):
484-
"Cheap function to invert a hash."
485-
i = {}
486-
for k,v in h.items():
487-
i[v] = k
488-
return i
489-
490-
XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
491-
"quot" : '"',
492-
"amp" : "&",
493-
"lt" : "<",
494-
"gt" : ">" }
495-
496-
XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
497-
498510
def _convertEntities(self, match):
499511
"""Used in a call to re.sub to replace HTML, XML, and numeric
500512
entities with the appropriate Unicode characters. If HTML
@@ -681,15 +693,6 @@ def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
681693
def __unicode__(self):
682694
return self.__str__(None)
683695

684-
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
685-
+ "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
686-
+ ")")
687-
688-
def _sub_entity(self, x):
689-
"""Used with a regular expression to substitute the
690-
appropriate XML entity for an XML special character."""
691-
return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
692-
693696
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
694697
prettyPrint=False, indentLevel=0):
695698
"""Returns a string or Unicode representation of this tag and

0 commit comments

Comments
 (0)