@@ -1,23 +1,23 @@
 #!/usr/bin/env python3
 
-import io
-import os
+from typing import Callable, List, Tuple
 import urllib.request
 import urllib.error
 from lxml import etree
 from lxml.html import html5parser
 from pandas import DataFrame
-from typing import Callable, List, Tuple
 from http.client import HTTPResponse
-import re
 
 
 def fetch(params):
     url = params['url']
     selector_string = params['selector']
 
-    if not url: return (None, 'Missing URL')
-    if not selector_string: return (None, 'Missing selector')
+    if not url:
+        return (None, 'Missing URL')
+
+    if not selector_string:
+        return (None, 'Missing selector')
 
     try:
         selector = xpath(selector_string)
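
As a point of reference for the reshaped guard clauses, here is a rough sketch of driving the `fetch` entry point. The `params` keys come from the diff; the URL and selector values are invented, and it assumes `fetch` mirrors `do_fetch`'s `(table, error)` return shape, as its visible return statements suggest:

```python
# Hypothetical call; values are made up.
params = {
    'url': 'https://example.com/feed.xml',    # made-up URL
    'selector': '//item/title/text()',        # made-up XPath selector
}
table, error = fetch(params)
if error is not None:
    print(error)   # e.g. 'Missing URL' when params['url'] is empty
else:
    print(table)   # single-column DataFrame of XPath results
```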
@@ -40,7 +40,7 @@ def xpath(s: str) -> etree.XPath:
     """
     return etree.XPath(
         s,
-        smart_strings=True, # so result strings don't ref XML doc
+        smart_strings=True,  # so result strings don't ref XML doc
         namespaces={
             'svg': 'http://www.w3.org/2000/svg',
         }
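
A small illustration of what the registered `svg` prefix is for: an expression compiled by `xpath()` can address SVG elements inside a mixed document. The markup below is made up:

```python
# Assumes the xpath() helper above; the sample document is invented.
doc = etree.fromstring(
    '<root><svg xmlns="http://www.w3.org/2000/svg">'
    '<title>chart</title></svg></root>'
)
print(xpath('//svg:title/text()')(doc))  # ['chart']
```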
@@ -92,10 +92,10 @@ def select(tree: etree._Element, selector: etree.XPath) -> List[str]:
         return list(_item_to_string(item) for item in result)
     else:
         # count(//a) => float. Return list of float.
-        return [ result ]
+        return [result]
 
 
-def do_fetch(url: str, selector: etree.XPath,
+def do_fetch(url: str, selector: etree.XPath,
              urlopen: Callable[[str], HTTPResponse]=urllib.request.urlopen,
              max_n_bytes: int=5*1024*1024,
              timeout: float=30) -> Tuple[DataFrame, str]:
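
The two return shapes of `select()` shown in this hunk, sketched against a throwaway document; this assumes `_item_to_string` (not shown here) stringifies nodes and attribute values as its name suggests:

```python
# Illustrative only; the sample tree is invented.
tree = etree.fromstring('<p><a href="/x">x</a><a href="/y">y</a></p>')
print(select(tree, xpath('//a/@href')))   # node-set -> list of str, e.g. ['/x', '/y']
print(select(tree, xpath('count(//a)')))  # number   -> list of float, [2.0]
```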
@@ -108,23 +108,24 @@ def do_fetch(url: str, selector: etree.XPath,
     timeout -- number of seconds before we abort
     """
     try:
-        (response_info, text) = fetch_text(url, urlopen=urlopen, timeout=timeout)
+        (response_info, text) = fetch_text(url, urlopen=urlopen,
+                                           timeout=timeout)
     except urllib.error.URLError as e:
         return (None, f'Fetch error: {e.msg}')
-    except os.TimeoutError:
+    except TimeoutError:
         return (None, 'HTTP request timed out')
     except ValueError as e:
-        return (None, str(e)) # Exceeded max_n_bytes
+        return (None, str(e))  # Exceeded max_n_bytes
     except UnicodeDecodeError:
         return (None, 'HTML or XML has invalid charset')
 
     is_html = response_info.get_content_type() == 'text/html'
 
-    tree = parse_document(text, is_html) # FIXME handle errors
+    tree = parse_document(text, is_html)  # FIXME handle errors
 
-    values = select(tree, selector) # FIXME handle errors?
+    values = select(tree, selector)  # FIXME handle errors?
 
-    table = DataFrame({ 'XPath result': values })
+    table = DataFrame({'XPath result': values})
 
     return (table, None)
 
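
For reference, the single-column table built at the end of `do_fetch` is easy to poke at in isolation; the values below are invented:

```python
# Illustrative values only; real values come from select().
table = DataFrame({'XPath result': ['alpha', 'beta', 'gamma']})
print(table.shape)                  # (3, 1): one row per selected value
print(list(table['XPath result']))  # ['alpha', 'beta', 'gamma']
```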
@@ -136,21 +137,23 @@ def fetch_text(url: str, max_n_bytes: int=5*1024*1024, timeout: float=30,
     This will never read more than `max_n_bytes` bytes from the response.
     It will also return before `timeout`s expire.
 
-    Throw `os.TimeoutError` if `timeout` expires.
+    Throw `TimeoutError` if `timeout` expires.
 
     Throw `ValueError` if the `max_n_bytes` is exceeded.
 
     Throw `URLError` if anything fails at the HTTP level or below.
 
     Throw `UnicodeDecodeError` if we cannot understand URL's encoding.
     """
-    # Throws os.URLError or os.TimeoutError
+    # Throws URLError or TimeoutError
     with urlopen(url, timeout=timeout) as response:
         # TODO avoid DoS. The above timeout is the _socket_ timeout: one
         # byte from the server resets it.
         b = response.read(max_n_bytes + 1)
         if (len(b) == max_n_bytes + 1):
-            raise ValueError(f'HTTP response is larger than {max_n_bytes} bytes')
+            raise ValueError(
+                f'HTTP response is larger than {max_n_bytes} bytes'
+            )
 
         text = b.decode(response.info().get_content_charset() or 'utf-8')
         return (response.info(), text)
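
Because `urlopen` is injectable, the size cap is easy to exercise without a network. A rough test sketch, relying only on what the diff shows `fetch_text` touching; `FakeResponse` and the lambda are hypothetical stand-ins:

```python
# Hypothetical stand-in for an HTTPResponse; it only mimics what fetch_text uses.
class FakeResponse:
    def __enter__(self):
        return self

    def __exit__(self, *exc):
        return False

    def read(self, n=-1):
        return b'x' * max(n, 0)   # always hands back as many bytes as asked for

try:
    fetch_text('http://example.org/big', max_n_bytes=10,
               urlopen=lambda url, timeout: FakeResponse())
except ValueError as e:
    print(e)  # 'HTTP response is larger than 10 bytes'
```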