Skip to content

Commit 240f381

Browse files
HemangChothani
authored and tswast committed
feat(bigquery): add RowIterator.to_dataframe_iterable method to get pandas DataFrame per page (#10017)
* feat(bigquery): make RowIterator._to_dataframe_iterable public
* feat(bigquery): cosmetic changes and unit test changes
* feat(bigquery): changes as per review comments
1 parent 3bb565b commit 240f381

File tree

2 files changed

+98
-3
lines changed

2 files changed

+98
-3
lines changed

bigquery/google/cloud/bigquery/table.py

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1554,11 +1554,44 @@ def to_arrow(
15541554
arrow_schema = _pandas_helpers.bq_to_arrow_schema(self._schema)
15551555
return pyarrow.Table.from_batches(record_batches, schema=arrow_schema)
15561556

1557-
def _to_dataframe_iterable(self, bqstorage_client=None, dtypes=None):
1557+
def to_dataframe_iterable(self, bqstorage_client=None, dtypes=None):
15581558
"""Create an iterable of pandas DataFrames, to process the table as a stream.
15591559
1560-
See ``to_dataframe`` for argument descriptions.
1560+
Args:
1561+
bqstorage_client (google.cloud.bigquery_storage_v1beta1.BigQueryStorageClient):
1562+
**Beta Feature** Optional. A BigQuery Storage API client. If
1563+
supplied, use the faster BigQuery Storage API to fetch rows
1564+
from BigQuery.
1565+
1566+
This method requires the ``pyarrow`` and
1567+
``google-cloud-bigquery-storage`` libraries.
1568+
1569+
Reading from a specific partition or snapshot is not
1570+
currently supported by this method.
1571+
1572+
**Caution**: There is a known issue reading small anonymous
1573+
query result tables with the BQ Storage API. When a problem
1574+
is encountered reading a table, the tabledata.list method
1575+
from the BigQuery API is used, instead.
1576+
dtypes (Map[str, Union[str, pandas.Series.dtype]]):
1577+
Optional. A dictionary of column names pandas ``dtype``s. The
1578+
provided ``dtype`` is used when constructing the series for
1579+
the column specified. Otherwise, the default pandas behavior
1580+
is used.
1581+
1582+
Returns:
1583+
pandas.DataFrame:
1584+
A generator of :class:`~pandas.DataFrame`.
1585+
1586+
Raises:
1587+
ValueError:
1588+
If the :mod:`pandas` library cannot be imported.
15611589
"""
1590+
if pandas is None:
1591+
raise ValueError(_NO_PANDAS_ERROR)
1592+
if dtypes is None:
1593+
dtypes = {}
1594+
15621595
column_names = [field.name for field in self._schema]
15631596
bqstorage_download = functools.partial(
15641597
_pandas_helpers.download_dataframe_bqstorage,
@@ -1683,7 +1716,7 @@ def to_dataframe(
16831716
progress_bar = self._get_progress_bar(progress_bar_type)
16841717

16851718
frames = []
1686-
for frame in self._to_dataframe_iterable(
1719+
for frame in self.to_dataframe_iterable(
16871720
bqstorage_client=bqstorage_client, dtypes=dtypes
16881721
):
16891722
frames.append(frame)

bigquery/tests/unit/test_table.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2014,6 +2014,68 @@ def test_to_arrow_w_pyarrow_none(self):
20142014
with self.assertRaises(ValueError):
20152015
row_iterator.to_arrow()
20162016

2017+
@unittest.skipIf(pandas is None, "Requires `pandas`")
2018+
def test_to_dataframe_iterable(self):
2019+
from google.cloud.bigquery.schema import SchemaField
2020+
import types
2021+
2022+
schema = [
2023+
SchemaField("name", "STRING", mode="REQUIRED"),
2024+
SchemaField("age", "INTEGER", mode="REQUIRED"),
2025+
]
2026+
2027+
path = "/foo"
2028+
api_request = mock.Mock(
2029+
side_effect=[
2030+
{
2031+
"rows": [{"f": [{"v": "Bengt"}, {"v": "32"}]}],
2032+
"pageToken": "NEXTPAGE",
2033+
},
2034+
{"rows": [{"f": [{"v": "Sven"}, {"v": "33"}]}]},
2035+
]
2036+
)
2037+
2038+
row_iterator = self._make_one(
2039+
_mock_client(), api_request, path, schema, page_size=1, max_results=5
2040+
)
2041+
dfs = row_iterator.to_dataframe_iterable()
2042+
2043+
self.assertIsInstance(dfs, types.GeneratorType)
2044+
2045+
df_1 = next(dfs)
2046+
self.assertIsInstance(df_1, pandas.DataFrame)
2047+
self.assertEqual(df_1.name.dtype.name, "object")
2048+
self.assertEqual(df_1.age.dtype.name, "int64")
2049+
self.assertEqual(len(df_1), 1) # verify the number of rows
2050+
self.assertEqual(
2051+
df_1["name"][0], "Bengt"
2052+
) # verify the first value of 'name' column
2053+
self.assertEqual(df_1["age"][0], 32) # verify the first value of 'age' column
2054+
2055+
df_2 = next(dfs)
2056+
self.assertEqual(len(df_2), 1) # verify the number of rows
2057+
self.assertEqual(df_2["name"][0], "Sven")
2058+
self.assertEqual(df_2["age"][0], 33)
2059+
2060+
@mock.patch("google.cloud.bigquery.table.pandas", new=None)
2061+
def test_to_dataframe_iterable_error_if_pandas_is_none(self):
2062+
from google.cloud.bigquery.schema import SchemaField
2063+
2064+
schema = [
2065+
SchemaField("name", "STRING", mode="REQUIRED"),
2066+
SchemaField("age", "INTEGER", mode="REQUIRED"),
2067+
]
2068+
rows = [
2069+
{"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
2070+
{"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
2071+
]
2072+
path = "/foo"
2073+
api_request = mock.Mock(return_value={"rows": rows})
2074+
row_iterator = self._make_one(_mock_client(), api_request, path, schema)
2075+
2076+
with pytest.raises(ValueError, match="pandas"):
2077+
row_iterator.to_dataframe_iterable()
2078+
20172079
@unittest.skipIf(pandas is None, "Requires `pandas`")
20182080
def test_to_dataframe(self):
20192081
from google.cloud.bigquery.schema import SchemaField

0 commit comments

Comments
 (0)