Commit c3376a8

Handle XPathEvalError
We saw a case in production where 'div:nth-child(6)' was parsed as valid xpath but couldn't eval. That's a bug in libxpath, sure ... but we _do_ need to handle syntactically-valid xpath that can't eval, so the xpath bug isn't worth pursuing.

Fixes this warning+error on production:

Warning: xmlXPathCompOpEval: function nth-child bound to undefined prefix div

Error: Exception in scrapexpathlist.fetch
Traceback (most recent call last):
  File "/app/server/models/loaded_module.py", line 207, in fetch
    out = await future_result
  File "/usr/local/lib/python3.7/concurrent/futures/thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/tmp/minio_downloadr4rod_6w", line 33, in fetch
  File "/tmp/minio_downloadr4rod_6w", line 136, in do_fetch
  File "/tmp/minio_downloadr4rod_6w", line 96, in select
  File "src/lxml/xpath.pxi", line 445, in lxml.etree.XPath.__call__
  File "src/lxml/xpath.pxi", line 227, in lxml.etree._XPathEvaluatorBase._handle_result
lxml.etree.XPathEvalError: Error in xpath expression
1 parent f643be5 commit c3376a8
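
For illustration, a minimal sketch of the failure mode described above (assuming lxml/libxml2 behave as the commit message reports; exact error text varies by version):

# Sketch only: the selector compiles (no XPathSyntaxError), but evaluating
# it against a parsed document raises XPathEvalError at call time.
from lxml import etree
from lxml.html import html5parser

selector = etree.XPath('div:nth-child(6)')  # accepted by the XPath compiler
tree = html5parser.fromstring('<p>hi</p>')
try:
    selector(tree)  # evaluation is where it blows up
except etree.XPathEvalError as err:
    print('XPath error: %s' % err)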

File tree

3 files changed: +50 -14 lines


scrapexpathlist.py

Lines changed: 25 additions & 10 deletions
@@ -1,12 +1,13 @@
 #!/usr/bin/env python3
+
 import gzip
 from typing import Callable, List, Tuple
 import urllib.request
 import urllib.error
 from lxml import etree
 from lxml.html import html5parser
-from pandas import DataFrame
 from http.client import HTTPResponse
+from pandas import DataFrame


 def fetch(params):
@@ -28,7 +29,8 @@ def fetch(params):


 def xpath(s: str) -> etree.XPath:
-    """Parses an XPath selector, or throws etree.XPathSyntaxError.
+    """
+    Parse an XPath selector, or raise etree.XPathSyntaxError.

     A word on namespaces: this module parses HTML without a namespace.
     It parses embedded SVGs in the "svg" namespace. So your XPath
@@ -84,7 +86,10 @@ def _item_to_string(item) -> str:


 def select(tree: etree._Element, selector: etree.XPath) -> List[str]:
-    """Run an xpath expression on `tree` and convert results to strings.
+    """
+    Run an xpath expression on `tree` and convert results to strings.
+
+    Raise XPathEvalError on error.
     """
     # TODO avoid DoS. xpath selectors can take enormous amounts of CPU/memory
     result = selector(tree)
@@ -95,10 +100,13 @@ def select(tree: etree._Element, selector: etree.XPath) -> List[str]:
         return [result]


-def do_fetch(url: str, selector: etree.XPath,
-             urlopen: Callable[[str], HTTPResponse]=urllib.request.urlopen,
-             max_n_bytes: int=5*1024*1024,
-             timeout: float=30) -> Tuple[DataFrame, str]:
+def do_fetch(
+    url: str,
+    selector: etree.XPath,
+    urlopen: Callable[[str], HTTPResponse] = urllib.request.urlopen,
+    max_n_bytes: int = 5*1024*1024,
+    timeout: float = 30
+) -> Tuple[DataFrame, str]:
     """Open the given URL and selects `selector` xpath, as a
     (DataFrame, error_message) tuple.

@@ -127,15 +135,22 @@ def do_fetch(url: str, selector: etree.XPath,

     tree = parse_document(text, is_html)  # FIXME handle errors

-    values = select(tree, selector)  # FIXME handle errors?
+    try:
+        values = select(tree, selector)
+    except etree.XPathEvalError as err:
+        return (None, 'XPath error: %s' % err)

     table = DataFrame({'XPath result': values})

     return (table, None)


-def fetch_text(url: str, max_n_bytes: int=5*1024*1024, timeout: float=30,
-               urlopen: Callable[[str], HTTPResponse]=urllib.request.urlopen):
+def fetch_text(
+    url: str,
+    max_n_bytes: int = 5*1024*1024,
+    timeout: float = 30,
+    urlopen: Callable[[str], HTTPResponse] = urllib.request.urlopen
+):
     """Fetch (HTTPResponse.info(), text_content_str) from `url`.

     This will never read more than `max_n_bytes` bytes from the response.
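
For context, a hedged sketch of the caller-visible behavior after this change (the document and selector below are illustrative, and the exact error text depends on the lxml version):

from lxml import etree
from scrapexpathlist import parse_document, select, xpath

tree = parse_document('<p>hi</p>', True)  # is_html=True, as in do_fetch()
selector = xpath('//ns:a')  # syntactically valid, but 'ns' is never declared

# select() now documents that it raises XPathEvalError on evaluation failure...
try:
    select(tree, selector)
except etree.XPathEvalError as err:
    print(err)  # e.g. "Undefined namespace prefix"

# ...and do_fetch() catches it, returning (None, 'XPath error: ...') instead
# of letting the exception propagate to the caller.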

setup.py

Lines changed: 3 additions & 3 deletions
@@ -9,7 +9,7 @@
     author='Adam Hooper',
     author_email='adam@adamhooper.com',
     url='https://github.com/CJWorkbench/scrape-xpath-list',
-    packages=[ '' ],
-    py_modules=[ 'scrapexpathlist' ],
-    install_requires=[ 'pandas==0.23.0', 'lxml==4.2.1', 'html5lib==1.0.1' ]
+    packages=[''],
+    py_modules=['scrapexpathlist'],
+    install_requires=['pandas==0.24.1', 'lxml==4.2.1', 'html5lib==1.0.1']
 )

tests/test_scrapexpathlist.py

Lines changed: 22 additions & 1 deletion
@@ -2,7 +2,7 @@

 import io
 import unittest
-from scrapexpathlist import parse_document, select, xpath, fetch_text
+from scrapexpathlist import parse_document, select, xpath, fetch_text, do_fetch


 class Xml1(unittest.TestCase):
@@ -96,6 +96,9 @@ def __init__(self, headers):
     def get(self, key, default=None):
         return self.headers.get(key, default)

+    def get_content_type(self):
+        return self.headers.get('Content-Type', '').split(';')[0] or None
+
     def get_content_charset(self):
         parts = self.headers.get('Content-Type', '').split('charset=')
         if len(parts) == 2:
@@ -122,6 +125,24 @@ def info(self):
         return self._info


+class DoFetchTests(unittest.TestCase):
+    def _go(self, selector, response):
+        return do_fetch(
+            url='http://example.org',
+            selector=selector,
+            urlopen=lambda x, **kwargs: response
+        )
+
+    def test_xpath_eval_error(self):
+        selector = xpath('//ns:a')  # valid xpath
+        result = self._go(
+            selector,
+            FakeResponse(b'<p>hi</p>', {'Content-Type': 'text/html'})
+        )
+        self.assertEqual(result,
+                         (None, 'XPath error: Undefined namespace prefix'))
+
+
 class FetchTextTest(unittest.TestCase):
     def _go(self, response):
         return fetch_text('http://example.org',
