Skip to content

Commit aa86f4e

Browse files
committed
Handle Content-Encoding: gzip
1 parent 8504ddd commit aa86f4e

File tree

2 files changed

+96
-29
lines changed

2 files changed

+96
-29
lines changed

scrapexpathlist.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/usr/bin/env python3
2-
2+
import gzip
33
from typing import Callable, List, Tuple
44
import urllib.request
55
import urllib.error
@@ -114,6 +114,10 @@ def do_fetch(url: str, selector: etree.XPath,
114114
return (None, f'Fetch error: {e.msg}')
115115
except TimeoutError:
116116
return (None, 'HTTP request timed out')
117+
except EOFError:
118+
return (None, 'Compressed data was truncated')
119+
except OSError:
120+
return (None, 'Compressed data was not valid gzip')
117121
except ValueError as e:
118122
return (None, str(e)) # Exceeded max_n_bytes
119123
except UnicodeDecodeError:
@@ -143,6 +147,10 @@ def fetch_text(url: str, max_n_bytes: int=5*1024*1024, timeout: float=30,
143147
144148
Throw `URLError` if anything fails at the HTTP level or below.
145149
150+
Throw `EOFError` if the gzip-compressed data is truncated.
151+
152+
Throw `OSError` if gzip-decoding is given non-gzipped data.
153+
146154
Throw `UnicodeDecodeError` if we cannot understand URL's encoding.
147155
"""
148156
# Throws URLError or TimeoutError
@@ -155,5 +163,8 @@ def fetch_text(url: str, max_n_bytes: int=5*1024*1024, timeout: float=30,
155163
f'HTTP response is larger than {max_n_bytes} bytes'
156164
)
157165

166+
if response.info().get('Content-Encoding') == 'gzip':
167+
b = gzip.decompress(b)
168+
158169
text = b.decode(response.info().get_content_charset() or 'utf-8')
159170
return (response.info(), text)

tests/test_scrapexpathlist.py

Lines changed: 84 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,49 @@
11
#!/usr/bin/env python3
22

33
import io
4-
from lxml import etree
5-
from scrapexpathlist import parse_document, select, xpath
64
import unittest
5+
from scrapexpathlist import parse_document, select, xpath, fetch_text
76

87

98
class Xml1(unittest.TestCase):
109
def setUp(self):
1110
self.tree = parse_document(
12-
'<a><b><c>c</c><d foo="bar">d</d></b><b><c>C</c><d foo="baz">D</d></b><e>ehead<f>f</f>etail</e></a>',
11+
(
12+
'<a><b><c>c</c><d foo="bar">d</d></b><b><c>C</c>'
13+
'<d foo="baz">D</d></b><e>ehead<f>f</f>etail</e></a>'
14+
),
1315
False
1416
)
1517

16-
1718
def select(self, selector):
1819
return select(self.tree, xpath(selector))
1920

20-
2121
def test_convert_node_to_text(self):
22-
self.assertEqual(self.select('//c'), [ 'c', 'C' ])
23-
22+
self.assertEqual(self.select('//c'), ['c', 'C'])
2423

2524
def test_convert_subnodes_to_text(self):
26-
self.assertEqual(self.select('//b'), [ 'cd', 'CD' ])
27-
25+
self.assertEqual(self.select('//b'), ['cd', 'CD'])
2826

2927
def test_attributes(self):
30-
self.assertEqual(self.select('//d/@foo'), [ 'bar', 'baz' ])
31-
28+
self.assertEqual(self.select('//d/@foo'), ['bar', 'baz'])
3229

3330
def test_text(self):
34-
self.assertEqual(self.select('//d/text()'), [ 'd', 'D' ])
35-
31+
self.assertEqual(self.select('//d/text()'), ['d', 'D'])
3632

3733
def test_head(self):
38-
self.assertEqual(self.select('//f/preceding-sibling::text()'), [ 'ehead' ])
39-
34+
self.assertEqual(self.select('//f/preceding-sibling::text()'),
35+
['ehead'])
4036

4137
def test_tail(self):
42-
self.assertEqual(self.select('//f/following-sibling::text()'), [ 'etail' ])
43-
38+
self.assertEqual(self.select('//f/following-sibling::text()'),
39+
['etail'])
4440

4541
def test_count(self):
46-
self.assertEqual(self.select('count(//d)'), [ 2.0 ])
47-
42+
self.assertEqual(self.select('count(//d)'), [2.0])
4843

4944
def test_bool(self):
50-
self.assertEqual(self.select('boolean(//f)'), [ True ])
51-
self.assertEqual(self.select('boolean(//g)'), [ False ])
45+
self.assertEqual(self.select('boolean(//f)'), [True])
46+
self.assertEqual(self.select('boolean(//g)'), [False])
5247

5348

5449
class Html1(unittest.TestCase):
@@ -76,20 +71,81 @@ def setUp(self):
7671
True
7772
)
7873

79-
8074
def select(self, selector):
8175
return select(self.tree, xpath(selector))
8276

83-
8477
def test_simple(self):
85-
self.assertEqual(self.select('//p'), [ 'Foo', 'Bar' ])
86-
78+
self.assertEqual(self.select('//p'), ['Foo', 'Bar'])
8779

8880
def test_svg_namespace(self):
8981
# Works across namespaces
90-
self.assertEqual(self.select('//svg:path/@d'), [ 'M0 0L2 2' ])
91-
82+
self.assertEqual(self.select('//svg:path/@d'), ['M0 0L2 2'])
9283

9384
def test_add_missing_elements(self):
9485
# Parse invalid HTML by adding missing elements
95-
self.assertEqual(self.select('//tr'), [ 'Single-cell table' ])
86+
self.assertEqual(self.select('//tr'), ['Single-cell table'])
87+
88+
89+
class FakeResponseInfo:
    """Minimal stand-in for the headers object returned by `response.info()`.

    Wraps a plain dict of header name -> value and offers the two lookups
    the code under test performs on it: `get()` and `get_content_charset()`.
    """

    def __init__(self, headers):
        # Kept as a public attribute so tests can inspect or tweak headers.
        self.headers = headers

    def get(self, key, default=None):
        """Return the value of header `key`, or `default` if it is absent.

        An exact match wins; otherwise the name is compared
        case-insensitively, because HTTP header names are case-insensitive.
        """
        if key in self.headers:
            return self.headers[key]

        wanted = key.lower()
        for name, value in self.headers.items():
            if name.lower() == wanted:
                return value
        return default

    def get_content_charset(self):
        """Return the `charset` parameter of Content-Type, or None.

        Only the charset value itself is returned: surrounding quotes and
        any other Content-Type parameters are left out.
        """
        content_type = self.get('Content-Type', '')
        for param in content_type.split(';'):
            name, sep, value = param.partition('=')
            if sep and name.strip().lower() == 'charset':
                # An empty value is treated like a missing charset.
                return value.strip().strip('"\'') or None
        return None
102+
103+
104+
class FakeResponse:
    """In-memory stand-in for the response object handed back by `urlopen`.

    Supports what the tests' caller uses: the context-manager protocol,
    `read()` and `info()`.
    """

    def __init__(self, body, headers):
        """Store `body` (bytes) as a readable stream and wrap `headers`."""
        self.body = io.BytesIO(body)
        self._info = FakeResponseInfo(headers)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        # Nothing to release. Returning None (falsy) lets any exception
        # raised inside the `with` block propagate to the test.
        return None

    def read(self, max_n_bytes=-1):
        """Return up to `max_n_bytes` bytes of the body.

        The default of -1 reads everything that is left, so the fake can
        also be read without an explicit size.
        """
        return self.body.read(max_n_bytes)

    def info(self):
        """Return the fake headers object built from `headers`."""
        return self._info
120+
121+
122+
class FetchTextTest(unittest.TestCase):
    """Exercise `fetch_text` against canned in-memory HTTP responses."""

    # gzip stream whose payload is b'<p>hi</p>': 10-byte header, deflate
    # data, then an 8-byte trailer (CRC32 + uncompressed size).
    GZIPPED_HI = (b'\x1f\x8b\x08\x00gY\xbe[\x02\xff\xb3)\xb0\xcb'
                  b'\xc8\xb4\xd1/\xb0\x03\x00e\xd27m\t\x00\x00\x00')

    def _go(self, response):
        """Call `fetch_text` with a `urlopen` that always yields `response`."""
        return fetch_text('http://example.org',
                          urlopen=lambda x, **kwargs: response)

    def test_default(self):
        """A response without Content-Encoding is decoded as-is."""
        _, text = self._go(FakeResponse(b'<p>hi</p>', {}))
        self.assertEqual(text, '<p>hi</p>')

    def test_gzip_encoding(self):
        """Test that Content-Encoding: gzip gets decoded correctly."""
        _, text = self._go(FakeResponse(
            self.GZIPPED_HI,
            {'Content-Encoding': 'gzip'}
        ))

        self.assertEqual(text, '<p>hi</p>')

    def test_gzip_eoferror(self):
        """A gzip stream missing its 8-byte trailer raises EOFError."""
        with self.assertRaises(EOFError):
            self._go(FakeResponse(
                self.GZIPPED_HI[:-8],
                {'Content-Encoding': 'gzip'}
            ))

    def test_gzip_magic_number_error(self):
        """A body that is not gzip at all raises OSError."""
        with self.assertRaises(OSError):
            self._go(FakeResponse(b'<p>hi</p>', {'Content-Encoding': 'gzip'}))

0 commit comments

Comments
 (0)