Skip to content
Prev Previous commit
Next Next commit
Do not nix rows of empty
... but _ignore_ empty rows when inferring columns. This changes the result checked by test_spam_header: an all-empty row used to be dropped before the user's explicit header row number was applied, so the test now passes header=2 where it previously passed header=1.
  • Loading branch information
adamhooper committed Jun 27, 2018
commit 6fa04896dd763d86f98bc1efd3adb54e646de441
18 changes: 10 additions & 8 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,11 +496,7 @@ def _expand_colspan_rowspan(self, rows):
all_texts.append(texts)
remainder = next_remainder

# ignore all-empty-text rows
no_empty = [row for row in all_texts
if any(text for text in row)]

return no_empty
return all_texts

def _handle_hidden_tables(self, tbl_list, attr_name):
"""
Expand Down Expand Up @@ -785,10 +781,16 @@ def _data_to_frame(**kwargs):
header = kwargs.pop('header')
kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
if head:
rows = lrange(len(head))
body = head + body
if header is None: # special case when a table has <th> elements
header = 0 if rows == [0] else rows

# Infer header when there is a <thead> or top <th>-only rows
if header is None:
if len(head) == 1:
header = 0
else:
# ignore all-empty-text rows
header = [i for i, row in enumerate(head)
if any(text for text in row)]

if foot:
body += foot
Expand Down
125 changes: 82 additions & 43 deletions pandas/tests/io/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
date_range, Series)
from pandas.compat import (map, zip, StringIO, BytesIO,
is_platform_windows, PY3, reload)
from pandas.errors import ParserError
from pandas.io.common import URLError, file_path_to_url
import pandas.io.html
from pandas.io.html import read_html
Expand Down Expand Up @@ -147,7 +148,7 @@ def test_banklist_no_match(self):
assert isinstance(df, DataFrame)

def test_spam_header(self):
df = self.read_html(self.spam_data, '.*Water.*', header=1)[0]
df = self.read_html(self.spam_data, '.*Water.*', header=2)[0]
assert df.columns[0] == 'Proximates'
assert not df.empty

Expand Down Expand Up @@ -424,7 +425,7 @@ def test_multiple_tbody(self):
</tbody>
</table>''')[0]

expected = DataFrame({'A': [1, 3], 'B': [2, 4]})
expected = DataFrame(data=[[1, 2], [3, 4]], columns=['A', 'B'])

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -471,11 +472,8 @@ def test_thead_without_tr(self):
</tbody>
</table>''')[0]

expected = DataFrame(data={
'Country': ['Ukraine'],
'Municipality': ['Odessa'],
'Year': [1944],
})
expected = DataFrame(data=[['Ukraine', 'Odessa', 1944]],
columns=['Country', 'Municipality', 'Year'])

tm.assert_frame_equal(result, expected)

Expand All @@ -502,9 +500,10 @@ def test_tfoot_read(self):
</tfoot>
</table>'''

expected1 = DataFrame({'A': ['bodyA'], 'B': ['bodyB']})
expected2 = DataFrame({'A': ['bodyA', 'footA'],
'B': ['bodyB', 'footB']})
expected1 = DataFrame(data=[['bodyA', 'bodyB']], columns=['A', 'B'])

expected2 = DataFrame(data=[['bodyA', 'bodyB'], ['footA', 'footB']],
columns=['A', 'B'])

data1 = data_template.format(footer="")
data2 = data_template.format(
Expand Down Expand Up @@ -532,7 +531,7 @@ def test_parse_header_of_non_string_column(self):
</table>
''', header=0)[0]

expected = DataFrame(data={'S': ['text'], 'I': [1944]})
expected = DataFrame([['text', 1944]], columns=('S', 'I'))

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -676,11 +675,7 @@ def test_colspan_rowspan_1(self):
</table>
""")[0]

expected = DataFrame(data={
'A': ['a'],
'B': ['b'],
'C': ['c'],
})
expected = DataFrame([['a', 'b', 'c']], columns=['A', 'B', 'C'])

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -708,13 +703,8 @@ def test_colspan_rowspan_copy_values(self):
</table>
""", header=0)[0]

expected = DataFrame(data={
'X': ['A'],
'X.1': ['B'],
'Y': ['B'],
'Z': ['Z'],
'W': ['C'],
})
expected = DataFrame(data=[['A', 'B', 'B', 'Z', 'C']],
columns=['X', 'X.1', 'Y', 'Z', 'W'])

tm.assert_frame_equal(result, expected)

Expand All @@ -739,13 +729,8 @@ def test_colspan_rowspan_both_not_1(self):
</table>
""", header=0)[0]

expected = DataFrame(data={
'A': ['A'],
'B': ['B'],
'B.1': ['B'],
'B.2': ['B'],
'C': ['D'],
})
expected = DataFrame(data=[['A', 'B', 'B', 'B', 'D']],
columns=['A', 'B', 'B.1', 'B.2', 'C'])

tm.assert_frame_equal(result, expected)

Expand All @@ -769,10 +754,7 @@ def test_rowspan_at_end_of_row(self):
</table>
""", header=0)[0]

expected = DataFrame(data={
'A': ['C'],
'B': ['B']
})
expected = DataFrame(data=[['C', 'B']], columns=['A', 'B'])

tm.assert_frame_equal(result, expected)

Expand All @@ -788,14 +770,12 @@ def test_rowspan_only_rows(self):
</table>
""", header=0)[0]

expected = DataFrame(data={
'A': ['A', 'A'],
'B': ['B', 'B'],
})
expected = DataFrame(data=[['A', 'B'], ['A', 'B']],
columns=['A', 'B'])

tm.assert_frame_equal(result, expected)

def test_header_inferred_from_th_elements(self):
def test_header_inferred_from_rows_with_only_th(self):
# GH17054
result = self.read_html("""
<table>
Expand All @@ -814,10 +794,9 @@ def test_header_inferred_from_th_elements(self):
</table>
""")[0]

expected = DataFrame(data={
('A', 'a'): [1],
('B', 'b'): [2],
})
columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']],
labels=[[0, 1], [0, 1]])
expected = DataFrame(data=[[1, 2]], columns=columns)

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -856,6 +835,23 @@ def test_wikipedia_states_table(self, datapath):
result = self.read_html(data, 'Arizona', header=1)[0]
assert result['sq mi'].dtype == np.dtype('float64')

def test_parser_error_on_empty_header_row(self):
    # Asking explicitly for header=[0, 1] on a table whose first <thead> row
    # has no text in any cell is expected to raise ParserError with the
    # message matched below.
    expected_message = (r"Passed header=\[0,1\] are "
                        r"too many rows for this "
                        r"multi_index of columns")

    # Fixture text is kept flush-left so the string passed to read_html is
    # unchanged.
    # NOTE(review): the second <th> in the first row has no closing tag --
    # confirm whether that is intentional.
    markup = """
<table>
<thead>
<tr><th></th><th></tr>
<tr><th>A</th><th>B</th></tr>
</thead>
<tbody>
<tr><td>a</td><td>b</td></tr>
</tbody>
</table>
"""

    with tm.assert_raises_regex(ParserError, expected_message):
        self.read_html(markup, header=[0, 1])

def test_decimal_rows(self):
# GH 12907
result = self.read_html('''<html>
Expand Down Expand Up @@ -960,6 +956,49 @@ def test_keep_default_na(self):
html_df = self.read_html(html_data, keep_default_na=True)[0]
tm.assert_frame_equal(expected_df, html_df)

def test_preserve_empty_rows(self):
    # A body row whose cells are all empty must come back as a row of NaN
    # instead of being dropped from the parsed frame.
    # Fixture text is kept flush-left so the string passed to read_html is
    # unchanged.
    markup = """
<table>
<tr>
<th>A</th>
<th>B</th>
</tr>
<tr>
<td>a</td>
<td>b</td>
</tr>
<tr>
<td></td>
<td></td>
</tr>
</table>
"""
    frames = self.read_html(markup)
    actual = frames[0]

    wanted = DataFrame(columns=['A', 'B'],
                       data=[['a', 'b'], [np.nan, np.nan]])

    tm.assert_frame_equal(actual, wanted)

def test_ignore_empty_rows_when_inferring_header(self):
    # With no explicit `header` argument, the all-empty first <thead> row is
    # expected to be left out of the inferred header, so the columns form a
    # two-level MultiIndex built from the remaining two header rows.
    # Fixture text is kept flush-left so the string passed to read_html is
    # unchanged.
    # NOTE(review): the second <th> in the first row has no closing tag --
    # confirm whether that is intentional.
    markup = """
<table>
<thead>
<tr><th></th><th></tr>
<tr><th>A</th><th>B</th></tr>
<tr><th>a</th><th>b</th></tr>
</thead>
<tbody>
<tr><td>1</td><td>2</td></tr>
</tbody>
</table>
"""
    actual = self.read_html(markup)[0]

    header_index = MultiIndex(levels=[['A', 'B'], ['a', 'b']],
                              labels=[[0, 1], [0, 1]])
    wanted = DataFrame(data=[[1, 2]], columns=header_index)

    tm.assert_frame_equal(actual, wanted)

def test_multiple_header_rows(self):
# Issue #13434
expected_df = DataFrame(data=[("Hillary", 68, "D"),
Expand Down