|  | 
| 26 | 26 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 
| 27 | 27 | # POSSIBILITY OF SUCH DAMAGE. | 
| 28 | 28 | 
 | 
| 29 |  | -import cgi | 
| 30 | 29 | import codecs | 
| 31 | 30 | import re | 
|  | 31 | +import typing as t | 
| 32 | 32 | 
 | 
| 33 | 33 | try: | 
| 34 | 34 |  try: | 
| @@ -68,6 +68,30 @@ def lazy_chardet_encoding(data): | 
| 68 | 68 | RE_XML_PI_ENCODING = re.compile(br'^<\?.*encoding=[\'"](.*?)[\'"].*\?>') | 
| 69 | 69 | 
 | 
| 70 | 70 | 
 | 
def parse_content_type(line: str) -> t.Tuple[str, str]:
    """Parse an HTTP Content-Type header.

    The return value will be a tuple of strings:
    the MIME type, and the value of the "charset" (if any).

    The header is split on ";". The first piece is the MIME type and is
    returned with surrounding whitespace removed. The remaining pieces
    are treated as "name=value" parameters; the name is matched
    case-insensitively against "charset", and the value is returned with
    surrounding whitespace and surrounding quote characters removed.

    If "charset" is given more than once, the last well-formed occurrence
    wins. A parameter that has no "=" is ignored (as cgi.parse_header()
    does), so a stray bare "charset" token cannot erase a charset that
    was specified elsewhere in the header.

    If no charset is present, the second element is an empty string.

    This is a custom replacement for Python's cgi.parse_header().
    The cgi module will be removed in Python 3.13.
    """

    # str.split() with an explicit separator always yields at least one
    # element (''.split(';') == ['']), so the MIME type can be unpacked
    # directly without an emptiness check.
    mime_type, *parameters = line.split(";")

    charset_value = ""
    for parameter in parameters:
        key, separator, value = parameter.partition("=")
        # Require the "=" so that a valueless token is skipped instead of
        # being read as an empty charset.
        if separator and key.strip().lower() == "charset":
            charset_value = value.strip().strip("\"'")

    return mime_type.strip(), charset_value
|  | 93 | + | 
|  | 94 | + | 
| 71 | 95 | def convert_to_utf8(http_headers, data, result): | 
| 72 | 96 |  """Detect and convert the character encoding to UTF-8. | 
| 73 | 97 | 
 | 
| @@ -181,10 +205,7 @@ def convert_to_utf8(http_headers, data, result): | 
| 181 | 205 |  # XML declaration encoding, and HTTP encoding, following the | 
| 182 | 206 |  # heuristic defined in RFC 3023. | 
| 183 | 207 |  http_content_type = http_headers.get('content-type') or '' | 
| 184 |  | - http_content_type, params = cgi.parse_header(http_content_type) | 
| 185 |  | - http_encoding = params.get('charset', '').replace("'", "") | 
| 186 |  | - if isinstance(http_encoding, bytes): | 
| 187 |  | - http_encoding = http_encoding.decode('utf-8', 'ignore') | 
|  | 208 | + http_content_type, http_encoding = parse_content_type(http_content_type) | 
| 188 | 209 | 
 | 
| 189 | 210 |  acceptable_content_type = 0 | 
| 190 | 211 |  application_content_types = ('application/xml', 'application/xml-dtd', | 
|  | 
0 commit comments