
Commit dacb15a

add simplejson sink
1 parent b304302 commit dacb15a


8 files changed: +230, -9 lines changed


README.md

Lines changed: 3 additions & 1 deletion
@@ -18,9 +18,11 @@ pip install pyspark-data-sources[all]
 Install the pyspark 4.0 preview version: https://pypi.org/project/pyspark/4.0.0.dev1/
 
 ```
-pip install pyspark==4.0.0.dev1
+pip install "pyspark[connect]==4.0.0.dev1"
 ```
 
+Or use Databricks Runtime 15.2 or above.
+
 Try the data sources!
 
 ```python

docs/datasources/simplejson.md

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+# SimpleJsonDataSource
+
+::: pyspark_datasources.simplejson.SimpleJsonDataSource

docs/index.md

Lines changed: 7 additions & 6 deletions
@@ -28,9 +28,10 @@ spark.read.format("github").load("apache/spark").show()
 
 ## Data Sources
 
-| Data Source                                         | Short Name    | Description                                  | Dependencies   |
-|-----------------------------------------------------|---------------|----------------------------------------------|----------------|
-| [GithubDataSource](./datasources/github.md)         | `github`      | Read pull requests from a Github repository  | None           |
-| [FakeDataSource](./datasources/fake.md)             | `fake`        | Generate fake data using the `Faker` library | `faker`        |
-| [HuggingFaceDatasets](./datasources/huggingface.md) | `huggingface` | Read datasets from the HuggingFace Hub       | `datasets`     |
-| [StockDataSource](./datasources/stock.md)           | `stock`       | Read stock data from Alpha Vantage           | None           |
+| Data Source                                         | Short Name    | Description                                  | Dependencies     |
+|-----------------------------------------------------|---------------|----------------------------------------------|------------------|
+| [GithubDataSource](./datasources/github.md)         | `github`      | Read pull requests from a Github repository  | None             |
+| [FakeDataSource](./datasources/fake.md)             | `fake`        | Generate fake data using the `Faker` library | `faker`          |
+| [HuggingFaceDatasets](./datasources/huggingface.md) | `huggingface` | Read datasets from the HuggingFace Hub       | `datasets`       |
+| [StockDataSource](./datasources/stock.md)           | `stock`       | Read stock data from Alpha Vantage           | None             |
+| [SimpleJsonDataSource](./datasources/simplejson.md) | `simplejson`  | Write JSON data to Databricks DBFS           | `databricks-sdk` |

mkdocs.yml

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ nav:
   - datasources/fake.md
   - datasources/huggingface.md
   - datasources/stock.md
+  - datasources/simplejson.md
 
 markdown_extensions:
   - pymdownx.highlight:

poetry.lock

Lines changed: 95 additions & 1 deletion
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 3 additions & 1 deletion
@@ -15,11 +15,13 @@ requests = "^2.31.0"
 faker = {version = "^23.1.0", optional = true}
 mkdocstrings = {extras = ["python"], version = "^0.24.0"}
 datasets = {version = "^2.17.0", optional = true}
+databricks-sdk = {version = "^0.28.0", optional = true}
 
 [tool.poetry.extras]
 faker = ["faker"]
 datasets = ["datasets"]
-all = ["faker", "datasets"]
+databricks = ["databricks-sdk"]
+all = ["faker", "datasets", "databricks"]
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.0.0"
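The new `databricks` extra makes the Databricks SDK an opt-in dependency. A hedged install sketch, assuming the package is published to PyPI as `pyspark-data-sources` (the name used in the README):

```
pip install "pyspark-data-sources[databricks]"   # pulls in databricks-sdk only
pip install "pyspark-data-sources[all]"          # all extras, now including databricks-sdk
```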

pyspark_datasources/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -2,3 +2,4 @@
 from .github import GithubDataSource
 from .huggingface import HuggingFaceDatasets
 from .stock import StockDataSource
+from .simplejson import SimpleJsonDataSource

pyspark_datasources/simplejson.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
import io
import json
import time

from dataclasses import dataclass
from typing import Dict, List

from pyspark.sql.types import StructType
from pyspark.sql.datasource import DataSource, DataSourceWriter, WriterCommitMessage


class SimpleJsonDataSource(DataSource):
    """
    A simple json writer for writing data to Databricks DBFS.

    Examples
    --------

    >>> import pyspark.sql.functions as sf
    >>> df = spark.range(0, 10, 1, 2).withColumn("value", sf.expr("concat('value_', id)"))

    Register the data source.

    >>> from pyspark_datasources import SimpleJsonDataSource
    >>> spark.dataSource.register(SimpleJsonDataSource)

    Append the DataFrame to a DBFS path as json files.

    >>> (
    ...     df.write.format("simplejson")
    ...     .mode("append")
    ...     .option("databricks_url", "https://your-databricks-instance.cloud.databricks.com")
    ...     .option("databricks_token", "your-token")
    ...     .save("/path/to/output")
    ... )

    Overwrite the DataFrame to a DBFS path as json files.

    >>> (
    ...     df.write.format("simplejson")
    ...     .mode("overwrite")
    ...     .option("databricks_url", "https://your-databricks-instance.cloud.databricks.com")
    ...     .option("databricks_token", "your-token")
    ...     .save("/path/to/output")
    ... )
    """

    @classmethod
    def name(cls) -> str:
        return "simplejson"

    def writer(self, schema: StructType, overwrite: bool):
        return SimpleJsonWriter(schema, self.options, overwrite)


@dataclass
class CommitMessage(WriterCommitMessage):
    output_path: str


class SimpleJsonWriter(DataSourceWriter):
    def __init__(self, schema: StructType, options: Dict, overwrite: bool):
        self.overwrite = overwrite
        self.databricks_url = options.get("databricks_url")
        self.databricks_token = options.get("databricks_token")
        if not self.databricks_url or not self.databricks_token:
            raise Exception("Databricks URL and token must be specified")
        self.path = options.get("path")
        if not self.path:
            raise Exception("You must specify an output path")

    def write(self, iterator):
        # Important: Always import non-serializable libraries inside the `write` method.
        from pyspark import TaskContext
        from databricks.sdk import WorkspaceClient

        # Consume all input rows and dump them as json.
        rows = [row.asDict() for row in iterator]
        json_data = json.dumps(rows)
        f = io.BytesIO(json_data.encode("utf-8"))

        context = TaskContext.get()
        id = context.taskAttemptId()
        file_path = f"{self.path}/{id}_{time.time_ns()}.json"

        # Upload to DBFS.
        w = WorkspaceClient(host=self.databricks_url, token=self.databricks_token)
        w.dbfs.upload(file_path, f)

        return CommitMessage(output_path=file_path)

    def commit(self, messages: List[CommitMessage]):
        from databricks.sdk import WorkspaceClient

        w = WorkspaceClient(host=self.databricks_url, token=self.databricks_token)
        paths = [message.output_path for message in messages]

        if self.overwrite:
            # Remove all files in the current directory except for the newly written files.
            for file in w.dbfs.list(self.path):
                if file.path not in paths:
                    print(f"[Overwrite] Removing file {file.path}")
                    w.dbfs.delete(file.path)

        # Write a success file.
        file_path = f"{self.path}/_SUCCESS"
        f = io.BytesIO(b"success")
        w.dbfs.upload(file_path, f, overwrite=True)

    def abort(self, messages: List[CommitMessage]):
        from databricks.sdk import WorkspaceClient

        w = WorkspaceClient(host=self.databricks_url, token=self.databricks_token)
        # Clean up the newly written files.
        for message in messages:
            if message is not None:
                print(f"[Abort] Removing partially written file: {message.output_path}")
                w.dbfs.delete(message.output_path)
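As a follow-up illustration (not part of the commit), a minimal read-back sketch: each task writes its partition as one JSON array file, so Spark's built-in JSON reader can load the output directory again. The path below is the placeholder from the docstring examples, and `multiLine` parsing is used because each file holds a single JSON document rather than one object per line.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = (
    spark.read.format("json")
    .option("multiLine", True)      # each file is one JSON array, not JSON Lines
    .load("dbfs:/path/to/output")   # placeholder path from the docstring examples
)
# Files whose names start with "_" (such as the _SUCCESS marker) are skipped
# by Spark's file listing, so they do not interfere with the read.
df.show()
```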

0 commit comments
