7979from __future__ import generators
8080
8181__author__ = "Leonard Richardson (leonardr@segfault.org)"
82- __version__ = "3.2.0 "
83- __copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
82+ __version__ = "3.2.1 "
83+ __copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
8484__license__ = "New-style BSD"
8585
8686from sgmllib import SGMLParser , SGMLParseError
@@ -114,6 +114,21 @@ class PageElement(object):
114114 """Contains the navigational information for some part of the page
115115 (either a tag or a piece of text)"""
116116
117+ def _invert (h ):
118+ "Cheap function to invert a hash."
119+ i = {}
120+ for k ,v in h .items ():
121+ i [v ] = k
122+ return i
123+
# Entity name -> the literal character that the entity stands for.
XML_ENTITIES_TO_SPECIAL_CHARS = {
    "apos": "'",
    "quot": '"',
    "amp": "&",
    "lt": "<",
    "gt": ">",
}

# Reverse lookup (literal character -> entity name), derived from the
# table above with _invert so the two mappings cannot drift apart.
XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
131+
117132 def setup (self , parent = None , previous = None ):
118133 """Sets up the initial relations between this element and
119134 other elements."""
@@ -421,6 +436,16 @@ def toEncoding(self, s, encoding=None):
421436 s = unicode (s )
422437 return s
423438
# Matches every "<" or ">", plus any "&" that is NOT the start of
# something shaped like an entity reference (decimal "&#123;",
# hexadecimal "&#x1F;" or named "&name;").  The negative lookahead is
# zero-width, so a match is always exactly one character long.
#
# The pattern is written as a raw string so that the regex escapes
# \d and \w reach the re module untouched; in a normal string literal
# they are invalid string escapes, which newer Python versions warn
# about.  The compiled pattern text is unchanged.
BARE_AMPERSAND_OR_BRACKET = re.compile(
    r"([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))")
442+
443+ def _sub_entity (self , x ):
444+ """Used with a regular expression to substitute the
445+ appropriate XML entity for an XML special character."""
446+ return "&" + self .XML_SPECIAL_CHARS_TO_ENTITIES [x .group (0 )[0 ]] + ";"
447+
448+
424449class NavigableString (unicode , PageElement ):
425450
426451 def __new__ (cls , value ):
@@ -451,10 +476,12 @@ def __unicode__(self):
451476 return str (self ).decode (DEFAULT_OUTPUT_ENCODING )
452477
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Return this string with XML entities substituted.

    Every match of BARE_AMPERSAND_OR_BRACKET is replaced through
    _sub_entity.  If `encoding` is true the escaped text is encoded
    with it; otherwise the escaped text is returned as is."""
    # Substitute outgoing XML entities before any encoding happens.
    escaped = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self)
    if not encoding:
        return escaped
    return escaped.encode(encoding)
458485
459486class CData (NavigableString ):
460487
@@ -480,21 +507,6 @@ class Tag(PageElement):
480507
481508 """Represents a found HTML tag with its attributes and contents."""
482509
483- def _invert (h ):
484- "Cheap function to invert a hash."
485- i = {}
486- for k ,v in h .items ():
487- i [v ] = k
488- return i
489-
490- XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'" ,
491- "quot" : '"' ,
492- "amp" : "&" ,
493- "lt" : "<" ,
494- "gt" : ">" }
495-
496- XML_SPECIAL_CHARS_TO_ENTITIES = _invert (XML_ENTITIES_TO_SPECIAL_CHARS )
497-
498510 def _convertEntities (self , match ):
499511 """Used in a call to re.sub to replace HTML, XML, and numeric
500512 entities with the appropriate Unicode characters. If HTML
@@ -681,15 +693,6 @@ def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
681693 def __unicode__ (self ):
682694 return self .__str__ (None )
683695
684- BARE_AMPERSAND_OR_BRACKET = re .compile ("([<>]|"
685- + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
686- + ")" )
687-
688- def _sub_entity (self , x ):
689- """Used with a regular expression to substitute the
690- appropriate XML entity for an XML special character."""
691- return "&" + self .XML_SPECIAL_CHARS_TO_ENTITIES [x .group (0 )[0 ]] + ";"
692-
693696 def __str__ (self , encoding = DEFAULT_OUTPUT_ENCODING ,
694697 prettyPrint = False , indentLevel = 0 ):
695698 """Returns a string or Unicode representation of this tag and
0 commit comments