
Commit c6d8411

SireInsectus authored and committed
Publishing v2.2.3
1 parent ce16c2d commit c6d8411


83 files changed (+724, -1130 lines)

File renamed without changes.
Lines changed: 55 additions & 14 deletions
@@ -142,26 +142,34 @@
 # MAGIC The <a href="https://docs.databricks.com/data/databricks-file-system.html" target="_blank">Databricks File System</a> (DBFS) is a virtual file system that allows you to treat cloud object storage as though it were local files and directories on the cluster.
 # MAGIC
 # MAGIC Run file system commands on DBFS using the magic command: **`%fs`**
+# MAGIC
+# MAGIC <br/>
+# MAGIC <img src="https://files.training.databricks.com/images/icon_hint_24.png"/>
+# MAGIC Replace the instances of <strong>FILL_IN</strong> in the cells below with your email address:
+
+# COMMAND ----------
+
+# MAGIC %fs mounts

 # COMMAND ----------

 # MAGIC %fs ls

 # COMMAND ----------

-# MAGIC %fs mounts
+# MAGIC %fs ls dbfs:/tmp

 # COMMAND ----------

-# MAGIC %fs ls /databricks-datasets
+# MAGIC %fs put dbfs:/tmp/FILL_IN.txt "This is a test of the emergency broadcast system, this is only a test" --overwrite=true

 # COMMAND ----------

-# MAGIC %fs head /databricks-datasets/README.md
+# MAGIC %fs head dbfs:/tmp/FILL_IN.txt

 # COMMAND ----------

-# MAGIC %fs mounts
+# MAGIC %fs ls dbfs:/tmp

 # COMMAND ----------

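Each %fs magic above is shorthand for a dbutils.fs call. A minimal Python sketch of the same exercise, using a hypothetical email address as a stand-in for the FILL_IN placeholder:

    # Same workflow as the %fs cells above, written against dbutils.fs.
    file_name = "dbfs:/tmp/student@example.com.txt"  # hypothetical FILL_IN value

    display(dbutils.fs.mounts())          # %fs mounts
    display(dbutils.fs.ls("dbfs:/tmp"))   # %fs ls dbfs:/tmp
    dbutils.fs.put(file_name, "This is a test of the emergency broadcast system, this is only a test", True)  # True = overwrite
    print(dbutils.fs.head(file_name))     # %fs head dbfs:/tmp/FILL_IN.txt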
@@ -178,28 +186,41 @@

 # COMMAND ----------

-dbutils.fs.ls("/databricks-datasets")
+dbutils.fs.ls("dbfs:/tmp")

 # COMMAND ----------

 # MAGIC %md Visualize results in a table using the Databricks <a href="https://docs.databricks.com/notebooks/visualizations/index.html#display-function-1" target="_blank">display</a> function

 # COMMAND ----------

-files = dbutils.fs.ls("/databricks-datasets")
+files = dbutils.fs.ls("dbfs:/tmp")
 display(files)

 # COMMAND ----------

+# MAGIC %md Let's take one more look at our temp file...
+
+# COMMAND ----------
+
+file_name = "dbfs:/tmp/FILL_IN.txt"
+contents = dbutils.fs.head(file_name)
+
+print("-"*80)
+print(contents)
+print("-"*80)
+
+# COMMAND ----------
+
 # MAGIC %md ## Our First Table
 # MAGIC
-# MAGIC Is located in the path identified by **`events_path`** (a variable we created for you).
+# MAGIC Is located in the path identified by **`DA.paths.events`** (a variable we created for you).
 # MAGIC
 # MAGIC We can see those files by running the following cell

 # COMMAND ----------

-files = dbutils.fs.ls(events_path)
+files = dbutils.fs.ls(DA.paths.events)
 display(files)

 # COMMAND ----------
@@ -213,7 +234,16 @@

 # COMMAND ----------

-spark.sql(f"SET c.events_path = {events_path}")
+spark.conf.set("whatever.events", DA.paths.events)
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC <img src="https://files.training.databricks.com/images/icon_note_24.png"> In the above example we use **`whatever.`** to give our variable a "namespace".
+# MAGIC
+# MAGIC This is so that we don't accidentally step over other configuration parameters.
+# MAGIC
+# MAGIC You will see throughout this course our use of the "DA" namespace, as in **`DA.paths.some_file`**

 # COMMAND ----------

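The pattern above is just a set/get round trip on the Spark conf; a short sketch with a hypothetical key and path:

    # The "whatever." prefix keeps our keys clear of Spark's built-in parameters.
    spark.conf.set("whatever.events", "dbfs:/some/path/events")  # hypothetical path
    print(spark.conf.get("whatever.events"))                     # -> dbfs:/some/path/events

SQL cells can then read the same key via ${whatever.events}, as the next hunk shows.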
@@ -225,15 +255,26 @@
 # MAGIC %sql
 # MAGIC CREATE TABLE IF NOT EXISTS events
 # MAGIC USING DELTA
-# MAGIC OPTIONS (path = "${c.events_path}");
+# MAGIC OPTIONS (path = "${whatever.events}");
+
+# COMMAND ----------
+
+# MAGIC %md This table was saved in the database created for you in the classroom setup.
+# MAGIC
+# MAGIC See the database name printed below.

 # COMMAND ----------

-# MAGIC %md This table was saved in the database created for you in the classroom setup. See the database name printed below.
+print(f"Database Name: {DA.db_name}")

 # COMMAND ----------

-print(database_name)
+# MAGIC %md ... or even the tables in that database:
+
+# COMMAND ----------
+
+# MAGIC %sql
+# MAGIC SHOW TABLES IN ${DA.db_name}

 # COMMAND ----------

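For reference, the same external table can be created from Python with an f-string in place of the ${...} substitution; a sketch assuming DA.paths.events points at an existing Delta directory:

    # Python equivalent of the SQL cell above (a sketch, not the course's own cell).
    spark.sql(f"""
        CREATE TABLE IF NOT EXISTS events
        USING DELTA
        OPTIONS (path = "{DA.paths.events}")
    """)
    display(spark.sql(f"SHOW TABLES IN {DA.db_name}"))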
@@ -299,7 +340,7 @@
 # COMMAND ----------

 dbutils.widgets.text("name", "Brickster", "Name")
-dbutils.widgets.multiselect("colors", "orange", ["red", "orange", "black", "blue"], "Traffic Sources")
+dbutils.widgets.multiselect("colors", "orange", ["red", "orange", "black", "blue"], "Favorite Color?")

 # COMMAND ----------

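Widget values are read back as strings; a short sketch:

    # dbutils.widgets.get returns the current value as a string;
    # a multiselect widget returns its selections as a comma-separated string.
    name = dbutils.widgets.get("name")                 # "Brickster" until changed
    colors = dbutils.widgets.get("colors").split(",")  # e.g. ["orange"]
    print(name, colors)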
@@ -331,7 +372,7 @@

 # COMMAND ----------

-classroom_cleanup()
+DA.cleanup()

 # COMMAND ----------

Lines changed: 9 additions & 9 deletions
@@ -29,9 +29,9 @@
 # COMMAND ----------

 # MAGIC %md ### 1. List data files in DBFS using magic commands
-# MAGIC Use a magic command to display files located in the DBFS directory: **`dbfs:/databricks-datasets`**
+# MAGIC Use a magic command to display files located in the DBFS directory: **`dbfs:/user`**
 # MAGIC
-# MAGIC <img src="https://files.training.databricks.com/images/icon_hint_32.png" alt="Hint"> You should see several datasets that come pre-installed in Databricks such as: **`COVID`**, **`adult`**, and **`airlines`**.
+# MAGIC <img src="https://files.training.databricks.com/images/icon_hint_32.png" alt="Hint"> You should see several user directories including your own. Depending on your permissions, you may see only your user directory.

 # COMMAND ----------

@@ -41,10 +41,10 @@
 # COMMAND ----------

 # MAGIC %md ### 2. List data files in DBFS using dbutils
-# MAGIC - Use **`dbutils`** to get the files at the directory above and save it to the variable **`files`**
+# MAGIC - Use **`dbutils`** to get the files at the directory above and assign it to the variable **`files`**
 # MAGIC - Use the Databricks display() function to display the contents in **`files`**
 # MAGIC
-# MAGIC <img src="https://files.training.databricks.com/images/icon_hint_32.png" alt="Hint"> You should see several datasets that come pre-installed in Databricks such as: **`COVID`**, **`adult`**, and **`airlines`**.
+# MAGIC <img src="https://files.training.databricks.com/images/icon_hint_32.png" alt="Hint"> Just as before, you should see several user directories including your own.

 # COMMAND ----------

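One possible solution sketch for exercises 1 and 2 (not the official answer key):

    # Exercise 1 (magic command): %fs ls dbfs:/user
    # Exercise 2 (dbutils equivalent):
    files = dbutils.fs.ls("dbfs:/user")
    display(files)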
@@ -55,10 +55,10 @@
 # COMMAND ----------

 # MAGIC %md ### 3. Create tables below from files in DBFS
-# MAGIC - Create the **`users`** table using the spark-context variable **`c.users_path`**
-# MAGIC - Create the **`sales`** table using the spark-context variable **`c.sales_path`**
-# MAGIC - Create the **`products`** table using the spark-context variable **`c.products_path`**
-# MAGIC - Create the **`events`** table using the spark-context variable **`c.events_path`**
+# MAGIC - Create the **`users`** table using the variable **`DA.paths.users`**
+# MAGIC - Create the **`sales`** table using the variable **`DA.paths.sales`**
+# MAGIC - Create the **`products`** table using the variable **`DA.paths.products`**
+# MAGIC - Create the **`events`** table using the variable **`DA.paths.events`**
 # MAGIC
 # MAGIC <img src="https://files.training.databricks.com/images/icon_hint_32.png"> Hint: We created the **`events`** table in the previous notebook but in a different database.

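A sketch for one of the four tables; the other three follow the same pattern with their respective DA.paths variables (again, not the official solution):

    spark.sql(f"""
        CREATE TABLE IF NOT EXISTS users
        USING DELTA
        OPTIONS (path = "{DA.paths.users}")
    """)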
@@ -164,7 +164,7 @@

 # COMMAND ----------

-classroom_cleanup()
+DA.cleanup()

 # COMMAND ----------

Lines changed: 3 additions & 3 deletions
@@ -20,8 +20,8 @@
 # MAGIC 1. Convert between DataFrames and SQL
 # MAGIC
 # MAGIC ##### Methods
-# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#spark-session-apis" target="_blank">SparkSession</a>: **`sql`**, **`table`**
-# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" target="_blank">DataFrame</a>:
+# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/spark_session.html" target="_blank">SparkSession</a>: **`sql`**, **`table`**
+# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html" target="_blank">DataFrame</a>:
 # MAGIC - Transformations: **`select`**, **`where`**, **`orderBy`**
 # MAGIC - Actions: **`show`**, **`count`**, **`take`**
 # MAGIC - Other methods: **`printSchema`**, **`schema`**, **`createOrReplaceTempView`**
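A sketch tying the listed methods together, assuming an events table with a device column exists in the current database:

    df = spark.table("events")            # SparkSession.table
    df.printSchema()
    result = (df.select("device")         # transformations are lazy...
                .where("device IS NOT NULL")
                .orderBy("device"))
    result.show(3)                        # ...actions trigger execution
    print(result.count())
    result.createOrReplaceTempView("event_devices")
    display(spark.sql("SELECT COUNT(*) AS n FROM event_devices"))  # SparkSession.sql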
@@ -282,7 +282,7 @@

 # COMMAND ----------

-classroom_cleanup()
+DA.cleanup()

 # COMMAND ----------

Lines changed: 4 additions & 4 deletions
@@ -18,10 +18,10 @@
 # MAGIC 1. Create the same DataFrame using a SQL query
 # MAGIC
 # MAGIC ##### Methods
-# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.SparkSession.html?highlight=sparksession" target="_blank">SparkSession</a>: **`sql`**, **`table`**
-# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" target="_blank">DataFrame</a> transformations: **`select`**, **`where`**, **`orderBy`**
+# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/spark_session.html" target="_blank">SparkSession</a>: **`sql`**, **`table`**
+# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html" target="_blank">DataFrame</a> transformations: **`select`**, **`where`**, **`orderBy`**
 # MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" target="_blank">DataFrame</a> actions: **`show`**, **`count`**, **`take`**
-# MAGIC - Other <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" target="_blank">DataFrame</a> methods: **`printSchema`**, **`schema`**, **`createOrReplaceTempView`**
+# MAGIC - Other <a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html" target="_blank">DataFrame</a> methods: **`printSchema`**, **`schema`**, **`createOrReplaceTempView`**

 # COMMAND ----------

@@ -118,7 +118,7 @@

 # COMMAND ----------

-classroom_cleanup()
+DA.cleanup()

 # COMMAND ----------

Lines changed: 20 additions & 9 deletions
@@ -45,7 +45,7 @@

 # COMMAND ----------

-users_csv_path = f"{datasets_dir}/users/users-500k.csv"
+users_csv_path = f"{DA.paths.datasets}/ecommerce/users/users-500k.csv"

 users_df = (spark
 .read
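The read is cut off at the hunk boundary; one common way such a CSV read continues (a sketch, not necessarily this file's exact options):

    users_df = (spark
        .read
        .option("header", True)       # first line holds the column names
        .option("inferSchema", True)  # sample the file to derive types
        .csv(users_csv_path))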
@@ -123,7 +123,7 @@

 # COMMAND ----------

-events_json_path = f"{datasets_dir}/events/events-500k.json"
+events_json_path = f"{DA.paths.datasets}/ecommerce/events/events-500k.json"

 events_df = (spark
 .read
@@ -187,7 +187,7 @@
 # COMMAND ----------

 # Step 1 - use this trick to transfer a value (the dataset path) between Python and Scala using the shared spark-config
-spark.conf.set("com.whatever.your_scope.events_path", events_json_path)
+spark.conf.set("whatever_your_scope.events", events_json_path)

 # COMMAND ----------

@@ -198,7 +198,7 @@

 # MAGIC %scala
 # MAGIC // Step 2 - pull the value from the config (or copy & paste it)
-# MAGIC val eventsJsonPath = spark.conf.get("com.whatever.your_scope.events_path")
+# MAGIC val eventsJsonPath = spark.conf.get("whatever_your_scope.events")
 # MAGIC
 # MAGIC // Step 3 - Read in the JSON, but let it infer the schema
 # MAGIC val eventsSchema = spark.read
@@ -260,7 +260,7 @@

 # COMMAND ----------

-users_output_dir = working_dir + "/users.parquet"
+users_output_dir = f"{DA.paths.working_dir}/users.parquet"

 (users_df
 .write
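Also cut off at the hunk boundary; a sketch of a typical Parquet write completion using standard DataFrameWriter calls:

    (users_df
        .write
        .option("compression", "snappy")
        .mode("overwrite")
        .parquet(users_output_dir))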
@@ -301,11 +301,22 @@

 # COMMAND ----------

-# MAGIC %md This table was saved in the database created for you in classroom setup. See database name printed below.
+# MAGIC %md This table was saved in the database created for you in the classroom setup.
+# MAGIC
+# MAGIC See the database name printed below.
+
+# COMMAND ----------
+
+print(f"Database Name: {DA.db_name}")
+
+# COMMAND ----------
+
+# MAGIC %md ... or even the tables in that database:

 # COMMAND ----------

-print(database_name)
+# MAGIC %sql
+# MAGIC SHOW TABLES IN ${DA.db_name}

 # COMMAND ----------

@@ -337,7 +348,7 @@

 # COMMAND ----------

-events_output_path = working_dir + "/delta/events"
+events_output_path = f"{DA.paths.working_dir}/delta/events"

 (events_df
 .write
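A sketch of a typical Delta completion for this truncated write:

    (events_df
        .write
        .format("delta")
        .mode("overwrite")
        .save(events_output_path))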
@@ -352,7 +363,7 @@

 # COMMAND ----------

-classroom_cleanup()
+DA.cleanup()

 # COMMAND ----------

Lines changed: 5 additions & 5 deletions
@@ -25,17 +25,17 @@
 # COMMAND ----------

 # MAGIC %md ### 1. Read with infer schema
-# MAGIC - View the first CSV file using DBUtils method **`fs.head`** with the filepath provided in the variable **`single_product_cs_fil_path`**
+# MAGIC - View the first CSV file using the DBUtils method **`fs.head`** with the filepath provided in the variable **`single_product_csv_file_path`**
 # MAGIC - Create **`products_df`** by reading from CSV files located in the filepath provided in the variable **`products_csv_path`**
 # MAGIC - Configure options to use first line as header and infer schema

 # COMMAND ----------

 # TODO
-single_product_csv_file_path = f"{datasets_dir}/products/products.csv/part-00000-tid-1663954264736839188-daf30e86-5967-4173-b9ae-d1481d3506db-2367-1-c000.csv"
+single_product_csv_file_path = f"{DA.paths.datasets}/products/products.csv/part-00000-tid-1663954264736839188-daf30e86-5967-4173-b9ae-d1481d3506db-2367-1-c000.csv"
 print(FILL_IN)

-products_csv_path = f"{datasets_dir}/products/products.csv"
+products_csv_path = f"{DA.paths.datasets}/products/products.csv"
 products_df = FILL_IN

 products_df.printSchema()
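A sketch of one way to complete the two FILL_INs above (they are left blank in the lab on purpose; this is not the official solution):

    print(dbutils.fs.head(single_product_csv_file_path))

    products_df = (spark
        .read
        .option("header", True)
        .option("inferSchema", True)
        .csv(products_csv_path))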
@@ -108,7 +108,7 @@
 # COMMAND ----------

 # TODO
-products_output_path = working_dir + "/delta/products"
+products_output_path = f"{DA.paths.working_dir}/delta/products"
 products_df.FILL_IN

 # COMMAND ----------
@@ -137,7 +137,7 @@

 # COMMAND ----------

-classroom_cleanup()
+DA.cleanup()

 # COMMAND ----------
