|
1 | 1 | #!/usr/bin/env python3
|
2 | 2 |
|
3 | 3 | import io
|
4 |
| -from lxml import etree |
5 |
| -from scrapexpathlist import parse_document, select, xpath |
6 | 4 | import unittest
|
| 5 | +from scrapexpathlist import parse_document, select, xpath, fetch_text |
7 | 6 |
|
8 | 7 |
|
9 | 8 | class Xml1(unittest.TestCase):
|
10 | 9 | def setUp(self):
|
11 | 10 | self.tree = parse_document(
|
12 |
| - '<a><b><c>c</c><d foo="bar">d</d></b><b><c>C</c><d foo="baz">D</d></b><e>ehead<f>f</f>etail</e></a>', |
| 11 | + ( |
| 12 | + '<a><b><c>c</c><d foo="bar">d</d></b><b><c>C</c>' |
| 13 | + '<d foo="baz">D</d></b><e>ehead<f>f</f>etail</e></a>' |
| 14 | + ), |
13 | 15 | False
|
14 | 16 | )
|
15 | 17 |
|
16 |
| - |
17 | 18 | def select(self, selector):
|
18 | 19 | return select(self.tree, xpath(selector))
|
19 | 20 |
|
20 |
| - |
21 | 21 | def test_convert_node_to_text(self):
|
22 |
| - self.assertEqual(self.select('//c'), [ 'c', 'C' ]) |
23 |
| - |
| 22 | + self.assertEqual(self.select('//c'), ['c', 'C']) |
24 | 23 |
|
25 | 24 | def test_convert_subnodes_to_text(self):
|
26 |
| - self.assertEqual(self.select('//b'), [ 'cd', 'CD' ]) |
27 |
| - |
| 25 | + self.assertEqual(self.select('//b'), ['cd', 'CD']) |
28 | 26 |
|
29 | 27 | def test_attributes(self):
|
30 |
| - self.assertEqual(self.select('//d/@foo'), [ 'bar', 'baz' ]) |
31 |
| - |
| 28 | + self.assertEqual(self.select('//d/@foo'), ['bar', 'baz']) |
32 | 29 |
|
33 | 30 | def test_text(self):
|
34 |
| - self.assertEqual(self.select('//d/text()'), [ 'd', 'D' ]) |
35 |
| - |
| 31 | + self.assertEqual(self.select('//d/text()'), ['d', 'D']) |
36 | 32 |
|
37 | 33 | def test_head(self):
|
38 |
| - self.assertEqual(self.select('//f/preceding-sibling::text()'), [ 'ehead' ]) |
39 |
| - |
| 34 | + self.assertEqual(self.select('//f/preceding-sibling::text()'), |
| 35 | + ['ehead']) |
40 | 36 |
|
41 | 37 | def test_tail(self):
|
42 |
| - self.assertEqual(self.select('//f/following-sibling::text()'), [ 'etail' ]) |
43 |
| - |
| 38 | + self.assertEqual(self.select('//f/following-sibling::text()'), |
| 39 | + ['etail']) |
44 | 40 |
|
45 | 41 | def test_count(self):
|
46 |
| - self.assertEqual(self.select('count(//d)'), [ 2.0 ]) |
47 |
| - |
| 42 | + self.assertEqual(self.select('count(//d)'), [2.0]) |
48 | 43 |
|
49 | 44 | def test_bool(self):
|
50 |
| - self.assertEqual(self.select('boolean(//f)'), [ True ]) |
51 |
| - self.assertEqual(self.select('boolean(//g)'), [ False ]) |
| 45 | + self.assertEqual(self.select('boolean(//f)'), [True]) |
| 46 | + self.assertEqual(self.select('boolean(//g)'), [False]) |
52 | 47 |
|
53 | 48 |
|
54 | 49 | class Html1(unittest.TestCase):
|
@@ -76,20 +71,81 @@ def setUp(self):
|
76 | 71 | True
|
77 | 72 | )
|
78 | 73 |
|
79 |
| - |
80 | 74 | def select(self, selector):
|
81 | 75 | return select(self.tree, xpath(selector))
|
82 | 76 |
|
83 |
| - |
84 | 77 | def test_simple(self):
|
85 |
| - self.assertEqual(self.select('//p'), [ 'Foo', 'Bar' ]) |
86 |
| - |
| 78 | + self.assertEqual(self.select('//p'), ['Foo', 'Bar']) |
87 | 79 |
|
88 | 80 | def test_svg_namespace(self):
|
89 | 81 | # Works across namespaces
|
90 |
| - self.assertEqual(self.select('//svg:path/@d'), [ 'M0 0L2 2' ]) |
91 |
| - |
| 82 | + self.assertEqual(self.select('//svg:path/@d'), ['M0 0L2 2']) |
92 | 83 |
|
93 | 84 | def test_add_missing_elements(self):
|
94 | 85 | # Parse invalid HTML by adding missing elements
|
95 |
| - self.assertEqual(self.select('//tr'), [ 'Single-cell table' ]) |
| 86 | + self.assertEqual(self.select('//tr'), ['Single-cell table']) |
| 87 | + |
| 88 | + |
| 89 | +class FakeResponseInfo: |
| 90 | + def __init__(self, headers): |
| 91 | + self.headers = headers |
| 92 | + |
| 93 | + def get(self, key, default=None): |
| 94 | + return self.headers.get(key, default) |
| 95 | + |
| 96 | + def get_content_charset(self): |
| 97 | + parts = self.headers.get('Content-Type', '').split('charset=') |
| 98 | + if len(parts) == 2: |
| 99 | + return parts[1] |
| 100 | + else: |
| 101 | + return None |
| 102 | + |
| 103 | + |
| 104 | +class FakeResponse: |
| 105 | + def __init__(self, body, headers): |
| 106 | + self.body = io.BytesIO(body) |
| 107 | + self._info = FakeResponseInfo(headers) |
| 108 | + |
| 109 | + def __enter__(self): |
| 110 | + return self |
| 111 | + |
| 112 | + def __exit__(self, type, value, tb): |
| 113 | + pass |
| 114 | + |
| 115 | + def read(self, max_n_bytes): |
| 116 | + return self.body.read(max_n_bytes) |
| 117 | + |
| 118 | + def info(self): |
| 119 | + return self._info |
| 120 | + |
| 121 | + |
| 122 | +class FetchTextTest(unittest.TestCase): |
| 123 | + def _go(self, response): |
| 124 | + return fetch_text('http://example.org', |
| 125 | + urlopen=lambda x, **kwargs: response) |
| 126 | + |
| 127 | + def test_default(self): |
| 128 | + info, s = self._go(FakeResponse(b'<p>hi</p>', {})) |
| 129 | + self.assertEqual(s, '<p>hi</p>') |
| 130 | + |
| 131 | + def test_gzip_encoding(self): |
| 132 | + """Test that Content-Encoding: gzip gets decoded correctly.""" |
| 133 | + info, s = self._go(FakeResponse( |
| 134 | + (b'\x1f\x8b\x08\x00gY\xbe[\x02\xff\xb3)\xb0\xcb' |
| 135 | + b'\xc8\xb4\xd1/\xb0\x03\x00e\xd27m\t\x00\x00\x00'), |
| 136 | + {'Content-Encoding': 'gzip'} |
| 137 | + )) |
| 138 | + |
| 139 | + self.assertEqual(s, '<p>hi</p>') |
| 140 | + |
| 141 | + def test_gzip_eoferror(self): |
| 142 | + with self.assertRaises(EOFError): |
| 143 | + self._go(FakeResponse( |
| 144 | + (b'\x1f\x8b\x08\x00gY\xbe[\x02\xff\xb3)\xb0\xcb' |
| 145 | + b'\xc8\xb4\xd1/\xb0\x03\x00'), |
| 146 | + {'Content-Encoding': 'gzip'} |
| 147 | + )) |
| 148 | + |
| 149 | + def test_gzip_magic_number_error(self): |
| 150 | + with self.assertRaises(OSError): |
| 151 | + self._go(FakeResponse(b'<p>hi</p>', {'Content-Encoding': 'gzip'})) |
0 commit comments