
Commit 3e7564d

Add Kaggle datasets (#9)
* Add Kaggle datasets
* fix
* cache in temp directory
* add dependencies
* fix dependencies
* fix dependencies
* fix
* update docs
* lock
1 parent 7495ae7 commit 3e7564d

File tree

8 files changed: +318 −14 lines changed


docs/datasources/kaggle.md

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+# KaggleDataSource
+
+> Requires the [`kagglehub`](https://github.com/Kaggle/kagglehub) library.
+
+::: pyspark_datasources.kaggle.KaggleDataSource
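For orientation, a minimal end-to-end sketch of the data source this page documents, mirroring the docstring examples in `pyspark_datasources/kaggle.py` below (assumes a SparkSession with Python data source support):

```python
from pyspark.sql import SparkSession
from pyspark_datasources import KaggleDataSource

spark = SparkSession.builder.getOrCreate()
spark.dataSource.register(KaggleDataSource)  # registers under the short name "kaggle"

# `handle` selects the Kaggle dataset; the load() path picks a file inside it.
df = (
    spark.read.format("kaggle")
    .options(handle="yasserh/titanic-dataset")
    .load("Titanic-Dataset.csv")
)
df.show()
```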

docs/index.md

Lines changed: 9 additions & 8 deletions
@@ -28,11 +28,12 @@ spark.read.format("github").load("apache/spark").show()
 
 ## Data Sources
 
-| Data Source | Short Name | Description | Dependencies |
-| ------------------------------------------------------- | -------------- | --------------------------------------------- | ---------------- |
-| [GithubDataSource](./datasources/github.md) | `github` | Read pull requests from a Github repository | None |
-| [FakeDataSource](./datasources/fake.md) | `fake` | Generate fake data using the `Faker` library | `faker` |
-| [HuggingFaceDatasets](./datasources/huggingface.md) | `huggingface` | Read datasets from the HuggingFace Hub | `datasets` |
-| [StockDataSource](./datasources/stock.md) | `stock` | Read stock data from Alpha Vantage | None |
-| [SimpleJsonDataSource](./datasources/simplejson.md) | `simplejson` | Read JSON data from a file | `databricks-sdk` |
-| [GoogleSheetsDataSource](./datasources/googlesheets.md) | `googlesheets` | Read table from public Google Sheets document | None |
+| Data Source | Short Name | Description | Dependencies |
+| ------------------------------------------------------- | -------------- | --------------------------------------------- | --------------------- |
+| [GithubDataSource](./datasources/github.md) | `github` | Read pull requests from a Github repository | None |
+| [FakeDataSource](./datasources/fake.md) | `fake` | Generate fake data using the `Faker` library | `faker` |
+| [HuggingFaceDatasets](./datasources/huggingface.md) | `huggingface` | Read datasets from the HuggingFace Hub | `datasets` |
+| [StockDataSource](./datasources/stock.md) | `stock` | Read stock data from Alpha Vantage | None |
+| [SimpleJsonDataSource](./datasources/simplejson.md) | `simplejson` | Read JSON data from a file | `databricks-sdk` |
+| [GoogleSheetsDataSource](./datasources/googlesheets.md) | `googlesheets` | Read table from public Google Sheets document | None |
+| [KaggleDataSource](./datasources/kaggle.md) | `kaggle` | Read datasets from Kaggle | `kagglehub`, `pandas` |

mkdocs.yml

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ nav:
   - datasources/stock.md
   - datasources/simplejson.md
   - datasources/googlesheets.md
+  - datasources/kaggle.md
 
 markdown_extensions:
   - pymdownx.highlight:

poetry.lock

Lines changed: 178 additions & 4 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 4 additions & 2 deletions
@@ -11,25 +11,27 @@ packages = [
 
 [tool.poetry.dependencies]
 python = ">=3.9,<=3.12"
+pyarrow = ">=11.0.0"
 requests = "^2.31.0"
 faker = {version = "^23.1.0", optional = true}
 mkdocstrings = {extras = ["python"], version = "^0.24.0"}
 datasets = {version = "^2.17.0", optional = true}
 databricks-sdk = {version = "^0.28.0", optional = true}
+kagglehub = {extras = ["pandas-datasets"], version = "^0.3.10", optional = true}
 
 [tool.poetry.extras]
 faker = ["faker"]
 datasets = ["datasets"]
 databricks = ["databricks-sdk"]
+kaggle = ["kagglehub"]
 lance = ["pylance"]
-all = ["faker", "datasets", "databricks"]
+all = ["faker", "datasets", "databricks-sdk", "kagglehub"]
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.0.0"
 grpcio = "^1.60.1"
 grpcio-status = "^1.60.1"
 pandas = "^2.2.0"
-pyarrow = "^15.0.0"
 mkdocs-material = "^9.5.9"
 
 [build-system]
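Since `kagglehub` sits behind the optional `kaggle` extra, importing `KaggleDataSource` without it only fails later, at read time. A minimal guard sketch (the PyPI package name in the install hint is an assumption, not shown in this diff):

```python
import importlib.util

# Hypothetical install hint: pip install "pyspark-data-sources[kaggle]"
# (package name assumed; check the project's README/PyPI page).
if importlib.util.find_spec("kagglehub") is None:
    raise ImportError("KaggleDataSource requires kagglehub; install the 'kaggle' extra.")
```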

pyspark_datasources/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -2,5 +2,6 @@
 from .github import GithubDataSource
 from .googlesheets import GoogleSheetsDataSource
 from .huggingface import HuggingFaceDatasets
+from .kaggle import KaggleDataSource
 from .simplejson import SimpleJsonDataSource
 from .stock import StockDataSource

pyspark_datasources/kaggle.py

Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
+import tempfile
+from functools import cached_property
+from typing import TYPE_CHECKING, Iterator
+
+from pyspark.sql.datasource import DataSource, DataSourceReader
+from pyspark.sql.pandas.types import from_arrow_schema
+from pyspark.sql.types import StructType
+
+if TYPE_CHECKING:
+    import pyarrow as pa
+
+
+class KaggleDataSource(DataSource):
+    """
+    A DataSource for reading Kaggle datasets in Spark.
+
+    This data source allows reading datasets from Kaggle directly into Spark DataFrames.
+
+    Name: `kaggle`
+
+    Options
+    -------
+    - `handle`: The dataset handle on Kaggle, in the form of `{owner_slug}/{dataset_slug}`
+      or `{owner_slug}/{dataset_slug}/versions/{version_number}`
+    - `path`: The path to a file within the dataset.
+    - `username`: The Kaggle username for authentication.
+    - `key`: The Kaggle API key for authentication.
+
+    Notes
+    -----
+    - The `kagglehub` library is required to use this data source. Make sure it is installed.
+    - To read private datasets or datasets that require user authentication, `username` and `key` must be provided.
+    - Currently all data is read from a single partition.
+
+    Examples
+    --------
+    Register the data source.
+
+    >>> from pyspark_datasources import KaggleDataSource
+    >>> spark.dataSource.register(KaggleDataSource)
+
+    Load a public dataset from Kaggle.
+
+    >>> spark.read.format("kaggle").options(handle="yasserh/titanic-dataset").load("Titanic-Dataset.csv").select("Name").show()
+    +--------------------+
+    |                Name|
+    +--------------------+
+    |Braund, Mr. Owen ...|
+    |Cumings, Mrs. Joh...|
+    |...                 |
+    +--------------------+
+
+    Load a private dataset with authentication.
+
+    >>> spark.read.format("kaggle").options(
+    ...     username="myaccount",
+    ...     key="<token>",
+    ...     handle="myaccount/my-private-dataset",
+    ... ).load("file.csv").show()
+    """
+
+    @classmethod
+    def name(cls) -> str:
+        return "kaggle"
+
+    @cached_property
+    def _data(self) -> "pa.Table":
+        import ast
+        import os
+
+        import pyarrow as pa
+
+        handle = self.options.pop("handle")
+        path = self.options.pop("path")
+        username = self.options.pop("username", None)
+        key = self.options.pop("key", None)
+        if username or key:
+            if not (username and key):
+                raise ValueError(
+                    "Both username and key must be provided to authenticate."
+                )
+            os.environ["KAGGLE_USERNAME"] = username
+            os.environ["KAGGLE_KEY"] = key
+
+        kwargs = {k: ast.literal_eval(v) for k, v in self.options.items()}
+
+        # Cache in a temporary directory to avoid writing to ~, which may be read-only.
+        with tempfile.TemporaryDirectory() as tmpdir:
+            os.environ["KAGGLEHUB_CACHE"] = tmpdir
+            import kagglehub
+
+            df = kagglehub.dataset_load(
+                kagglehub.KaggleDatasetAdapter.PANDAS,
+                handle,
+                path,
+                **kwargs,
+            )
+        return pa.Table.from_pandas(df)
+
+    def schema(self) -> StructType:
+        return from_arrow_schema(self._data.schema)
+
+    def reader(self, schema: StructType) -> "KaggleDataReader":
+        return KaggleDataReader(self)
+
+
+class KaggleDataReader(DataSourceReader):
+    def __init__(self, source: KaggleDataSource):
+        self.source = source
+
+    def read(self, partition) -> Iterator["pa.RecordBatch"]:
+        yield from self.source._data.to_batches()
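Note how `_data` pops the recognized options and forwards everything left over through `ast.literal_eval` as keyword arguments to `kagglehub.dataset_load`. A hedged sketch of what that enables — the `pandas_kwargs` parameter is an assumption about kagglehub's pandas adapter, not something this diff exercises:

```python
# Extra reader options arrive as strings; ast.literal_eval turns them into
# Python literals before they reach kagglehub.dataset_load. Assuming the
# pandas adapter accepts a pandas_kwargs dict (unverified here):
df = (
    spark.read.format("kaggle")
    .options(handle="yasserh/titanic-dataset")
    .option("pandas_kwargs", "{'nrows': 100}")  # literal-eval'd into a dict
    .load("Titanic-Dataset.csv")
)
```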

tests/test_data_sources.py

Lines changed: 8 additions & 0 deletions
@@ -23,3 +23,11 @@ def test_fake_datasource(spark):
     df.show()
     assert df.count() == 3
     assert len(df.columns) == 4
+
+
+def test_kaggle_datasource(spark):
+    spark.dataSource.register(KaggleDataSource)
+    df = spark.read.format("kaggle").options(handle="yasserh/titanic-dataset").load("Titanic-Dataset.csv")
+    df.show()
+    assert df.count() == 891
+    assert len(df.columns) == 12
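This test downloads the real Titanic dataset, so it needs network access plus the optional dependency. One way to make it skip gracefully when `kagglehub` is absent (a sketch, not part of the diff):

```python
import pytest

# Skip the test cleanly instead of erroring when the optional dep is missing.
kagglehub = pytest.importorskip("kagglehub")
```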
