Commit c3376a8

Handle XPathEvalError
We saw a case in production where 'div:nth-child(6)' was parsed as valid xpath but couldn't eval. That's a bug in libxpath, sure ... but we _do_ need to handle syntactically-valid xpath that can't eval, so the xpath bug isn't worth pursuing.

Fixes this warning+error on production:

Warning: xmlXPathCompOpEval: function nth-child bound to undefined prefix div

Error: Exception in scrapexpathlist.fetch
Traceback (most recent call last):
  File "/app/server/models/loaded_module.py", line 207, in fetch
    out = await future_result
  File "/usr/local/lib/python3.7/concurrent/futures/thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/tmp/minio_downloadr4rod_6w", line 33, in fetch
  File "/tmp/minio_downloadr4rod_6w", line 136, in do_fetch
  File "/tmp/minio_downloadr4rod_6w", line 96, in select
  File "src/lxml/xpath.pxi", line 445, in lxml.etree.XPath.__call__
  File "src/lxml/xpath.pxi", line 227, in lxml.etree._XPathEvaluatorBase._handle_result
lxml.etree.XPathEvalError: Error in xpath expression
1 parent f643be5 commit c3376a8
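
For illustration, a minimal sketch of the failure mode described above (assuming lxml/libxml2 behave as the commit message reports; exact error text varies by version):

# Sketch only: the selector compiles (no XPathSyntaxError), but evaluating
# it against a parsed document raises XPathEvalError at call time.
from lxml import etree
from lxml.html import html5parser

selector = etree.XPath('div:nth-child(6)')  # accepted by the XPath compiler
tree = html5parser.fromstring('<p>hi</p>')
try:
    selector(tree)  # evaluation is where it blows up
except etree.XPathEvalError as err:
    print('XPath error: %s' % err)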

File tree

3 files changed: +50 -14 lines


scrapexpathlist.py

Lines changed: 25 additions & 10 deletions
@@ -1,12 +1,13 @@
 #!/usr/bin/env python3
+
 import gzip
 from typing import Callable, List, Tuple
 import urllib.request
 import urllib.error
 from lxml import etree
 from lxml.html import html5parser
-from pandas import DataFrame
 from http.client import HTTPResponse
+from pandas import DataFrame


 def fetch(params):
@@ -28,7 +29,8 @@ def fetch(params):


 def xpath(s: str) -> etree.XPath:
-    """Parses an XPath selector, or throws etree.XPathSyntaxError.
+    """
+    Parse an XPath selector, or raise etree.XPathSyntaxError.

     A word on namespaces: this module parses HTML without a namespace.
     It parses embedded SVGs in the "svg" namespace. So your XPath
@@ -84,7 +86,10 @@ def _item_to_string(item) -> str:


 def select(tree: etree._Element, selector: etree.XPath) -> List[str]:
-    """Run an xpath expression on `tree` and convert results to strings.
+    """
+    Run an xpath expression on `tree` and convert results to strings.
+
+    Raise XPathEvalError on error.
     """
     # TODO avoid DoS. xpath selectors can take enormous amounts of CPU/memory
     result = selector(tree)
@@ -95,10 +100,13 @@ def select(tree: etree._Element, selector: etree.XPath) -> List[str]:
         return [result]


-def do_fetch(url: str, selector: etree.XPath,
-             urlopen: Callable[[str], HTTPResponse]=urllib.request.urlopen,
-             max_n_bytes: int=5*1024*1024,
-             timeout: float=30) -> Tuple[DataFrame, str]:
+def do_fetch(
+    url: str,
+    selector: etree.XPath,
+    urlopen: Callable[[str], HTTPResponse] = urllib.request.urlopen,
+    max_n_bytes: int = 5*1024*1024,
+    timeout: float = 30
+) -> Tuple[DataFrame, str]:
     """Open the given URL and selects `selector` xpath, as a
     (DataFrame, error_message) tuple.

@@ -127,15 +135,22 @@ def do_fetch(url: str, selector: etree.XPath,

     tree = parse_document(text, is_html)  # FIXME handle errors

-    values = select(tree, selector)  # FIXME handle errors?
+    try:
+        values = select(tree, selector)
+    except etree.XPathEvalError as err:
+        return (None, 'XPath error: %s' % err)

     table = DataFrame({'XPath result': values})

     return (table, None)


-def fetch_text(url: str, max_n_bytes: int=5*1024*1024, timeout: float=30,
-               urlopen: Callable[[str], HTTPResponse]=urllib.request.urlopen):
+def fetch_text(
+    url: str,
+    max_n_bytes: int = 5*1024*1024,
+    timeout: float = 30,
+    urlopen: Callable[[str], HTTPResponse] = urllib.request.urlopen
+):
     """Fetch (HTTPResponse.info(), text_content_str) from `url`.

     This will never read more than `max_n_bytes` bytes from the response.
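
For context, a hedged sketch of the caller-visible behavior after this change (the document and selector below are illustrative, and the exact error text depends on the lxml version):

from lxml import etree
from scrapexpathlist import parse_document, select, xpath

tree = parse_document('<p>hi</p>', True)  # is_html=True, as in do_fetch()
selector = xpath('//ns:a')  # syntactically valid, but 'ns' is never declared

# select() now documents that it raises XPathEvalError on evaluation failure...
try:
    select(tree, selector)
except etree.XPathEvalError as err:
    print(err)  # e.g. "Undefined namespace prefix"

# ...and do_fetch() catches it, returning (None, 'XPath error: ...') instead
# of letting the exception propagate to the caller.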

setup.py

Lines changed: 3 additions & 3 deletions
@@ -9,7 +9,7 @@
     author='Adam Hooper',
     author_email='adam@adamhooper.com',
     url='https://github.com/CJWorkbench/scrape-xpath-list',
-    packages=[ '' ],
-    py_modules=[ 'scrapexpathlist' ],
-    install_requires=[ 'pandas==0.23.0', 'lxml==4.2.1', 'html5lib==1.0.1' ]
+    packages=[''],
+    py_modules=['scrapexpathlist'],
+    install_requires=['pandas==0.24.1', 'lxml==4.2.1', 'html5lib==1.0.1']
 )

tests/test_scrapexpathlist.py

Lines changed: 22 additions & 1 deletion
@@ -2,7 +2,7 @@

 import io
 import unittest
-from scrapexpathlist import parse_document, select, xpath, fetch_text
+from scrapexpathlist import parse_document, select, xpath, fetch_text, do_fetch


 class Xml1(unittest.TestCase):
@@ -96,6 +96,9 @@ def __init__(self, headers):
     def get(self, key, default=None):
         return self.headers.get(key, default)

+    def get_content_type(self):
+        return self.headers.get('Content-Type', '').split(';')[0] or None
+
     def get_content_charset(self):
         parts = self.headers.get('Content-Type', '').split('charset=')
         if len(parts) == 2:
@@ -122,6 +125,24 @@ def info(self):
         return self._info


+class DoFetchTests(unittest.TestCase):
+    def _go(self, selector, response):
+        return do_fetch(
+            url='http://example.org',
+            selector=selector,
+            urlopen=lambda x, **kwargs: response
+        )
+
+    def test_xpath_eval_error(self):
+        selector = xpath('//ns:a')  # valid xpath
+        result = self._go(
+            selector,
+            FakeResponse(b'<p>hi</p>', {'Content-Type': 'text/html'})
+        )
+        self.assertEqual(result,
+                         (None, 'XPath error: Undefined namespace prefix'))
+
+
 class FetchTextTest(unittest.TestCase):
     def _go(self, response):
         return fetch_text('http://example.org',
