
Commit 314c715

SireInsectus authored and committed
Publishing v2.0.1
1 parent c317da0 commit 314c715


59 files changed: +586 -602 lines changed

Note: large commits have some content hidden by default, so some of the changed files below are listed without their file paths.


Apache-Spark-Programming-with-Databricks/ASP 0 - Course Agenda.py

Lines changed: 3 additions & 3 deletions
@@ -22,11 +22,11 @@
  # MAGIC
  # MAGIC ## Spark Core
  # MAGIC * [ASP 2.1 - Spark SQL]($./ASP 2 - Spark Core/ASP 2.1 - Spark SQL)
+ # MAGIC * [ASP 2.1L - Spark SQL Lab]($./ASP 2 - Spark Core/Labs/ASP 2.1L - Spark SQL Lab)
  # MAGIC * [ASP 2.2 - Reader & Writer]($./ASP 2 - Spark Core/ASP 2.2 - Reader & Writer)
- # MAGIC * [ASP 2.2L - Spark SQL Lab]($./ASP 2 - Spark Core/Labs/ASP 2.2L - Spark SQL Lab)
+ # MAGIC * [ASP 2.2L - Ingesting Data Lab]($./ASP 2 - Spark Core/Labs/ASP 2.2L - Ingesting Data Lab)
  # MAGIC * [ASP 2.3 - DataFrame & Column]($./ASP 2 - Spark Core/ASP 2.3 - DataFrame & Column)
- # MAGIC * [ASP 2.3L - Ingesting Data Lab]($./ASP 2 - Spark Core/Labs/ASP 2.3L - Ingesting Data Lab)
- # MAGIC * [ASP 2.4L - Purchase Revenues Lab]($./ASP 2 - Spark Core/Labs/ASP 2.4L - Purchase Revenues Lab)
+ # MAGIC * [ASP 2.3L - Purchase Revenues Lab]($./ASP 2 - Spark Core/Labs/ASP 2.3L - Purchase Revenues Lab)

  # COMMAND ----------

Apache-Spark-Programming-with-Databricks/ASP 1 - Introductions/ASP 1.1 - Databricks Platform.py

Lines changed: 3 additions & 3 deletions
@@ -30,7 +30,7 @@
  # COMMAND ----------

  # MAGIC %md ### Setup
- # MAGIC Run classroom setup to mount Databricks training datasets and create your own database for BedBricks.
+ # MAGIC Run classroom setup to [mount](https://docs.databricks.com/data/databricks-file-system.html#mount-storage) Databricks training datasets and create your own database for BedBricks.
  # MAGIC
  # MAGIC Use the **`%run`** magic command to run another notebook within a notebook

@@ -189,7 +189,7 @@

  # MAGIC %md ## Our First Table
  # MAGIC
- # MAGIC Is located in the path identfied by **`eventsPath`** (a variable we created for you).
+ # MAGIC Is located in the path identfied by **`events_path`** (a variable we created for you).
  # MAGIC
  # MAGIC We can see those files by running the following cell

@@ -219,7 +219,7 @@
  # COMMAND ----------

  # MAGIC %sql
- # MAGIC CREATE TABLE IF NOT EXISTS events USING parquet OPTIONS (path "${c.events_path}");
+ # MAGIC CREATE TABLE IF NOT EXISTS events USING delta OPTIONS (path "${c.events_path}");

  # COMMAND ----------
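For reference (not part of this commit): the change above registers the `events` table over Delta files instead of Parquet. A minimal PySpark sketch of the same registration, assuming `events_path` is the classroom-setup variable pointing at a directory already written in Delta format:

```python
# Register an external table over an existing Delta directory.
# events_path is assumed to be defined by the classroom setup.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS events
    USING DELTA
    OPTIONS (path "{events_path}")
""")

display(spark.table("events").limit(5))
```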

Apache-Spark-Programming-with-Databricks/ASP 1 - Introductions/Labs/ASP 1.1L - Explore Datasets Lab.py

Lines changed: 3 additions & 3 deletions
@@ -84,7 +84,7 @@
  # COMMAND ----------

  # MAGIC %md
- # MAGIC #### Q1: What products are available for purchase at BedBricks?
+ # MAGIC #### 4.1: What products are available for purchase at BedBricks?
  # MAGIC
  # MAGIC The **`products`** dataset contains the ID, name, and price of products on the BedBricks retail site.
  # MAGIC
@@ -105,7 +105,7 @@

  # COMMAND ----------

- # MAGIC %md #### Q2: What is the average purchase revenue for a transaction at BedBricks?
+ # MAGIC %md #### 4.2: What is the average purchase revenue for a transaction at BedBricks?
  # MAGIC
  # MAGIC The **`sales`** dataset contains order information representing successfully processed sales.
  # MAGIC Most fields correspond directly with fields from the clickstream data associated with a sale finalization event.
@@ -131,7 +131,7 @@

  # COMMAND ----------

- # MAGIC %md #### Q3: What types of events are recorded on the BedBricks website?
+ # MAGIC %md #### 4.3: What types of events are recorded on the BedBricks website?
  # MAGIC
  # MAGIC The **`events`** dataset contains two weeks worth of parsed JSON records, created by consuming updates to an operational database.
  # MAGIC Records are received whenever: (1) a new user visits the site, (2) a user provides their email for the first time.

Apache-Spark-Programming-with-Databricks/ASP 2 - Spark Core/ASP 2.1 - Spark SQL.py

Lines changed: 12 additions & 8 deletions
@@ -63,10 +63,12 @@

  # COMMAND ----------

- display(spark.table("products")
-         .select("name", "price")
-         .where("price < 200")
-         .orderBy("price"))
+ display(spark
+         .table("products")
+         .select("name", "price")
+         .where("price < 200")
+         .orderBy("price")
+        )

  # COMMAND ----------

@@ -153,10 +155,12 @@

  # COMMAND ----------

- budget_df = (spark.table("products")
-              .select("name", "price")
-              .where("price < 200")
-              .orderBy("price"))
+ budget_df = (spark
+              .table("products")
+              .select("name", "price")
+              .where("price < 200")
+              .orderBy("price")
+             )

  # COMMAND ----------
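For reference (not part of this commit): the reformatted chains above are the DataFrame API counterpart of a plain SQL query. A minimal sketch of the equivalent query through `spark.sql`, assuming the `products` table used in this notebook is registered:

```python
# spark.sql returns a DataFrame, just like the chained transformations above.
budget_df = spark.sql("""
    SELECT name, price
    FROM products
    WHERE price < 200
    ORDER BY price
""")

display(budget_df)
```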

Apache-Spark-Programming-with-Databricks/ASP 2 - Spark Core/ASP 2.2 - Reader & Writer.py

Lines changed: 7 additions & 7 deletions
@@ -245,7 +245,7 @@
  # MAGIC (df.write
  # MAGIC   .option("compression", "snappy")
  # MAGIC   .mode("overwrite")
- # MAGIC   .parquet(outPath)
+ # MAGIC   .parquet(output_dir)
  # MAGIC )
  # MAGIC ```
  # MAGIC
@@ -261,19 +261,19 @@

  # COMMAND ----------

- users_output_path = working_dir + "/users.parquet"
+ users_output_dir = working_dir + "/users.parquet"

  (users_df
    .write
    .option("compression", "snappy")
    .mode("overwrite")
-   .parquet(users_output_path)
+   .parquet(users_output_dir)
  )

  # COMMAND ----------

  display(
-   dbutils.fs.ls(users_output_path)
+   dbutils.fs.ls(users_output_dir)
  )

  # COMMAND ----------
@@ -285,7 +285,7 @@

  (users_df
    .write
-   .parquet(users_output_path, compression="snappy", mode="overwrite")
+   .parquet(users_output_dir, compression="snappy", mode="overwrite")
  )

  # COMMAND ----------
@@ -298,7 +298,7 @@

  # COMMAND ----------

- events_df.write.mode("overwrite").saveAsTable("events_p")
+ events_df.write.mode("overwrite").saveAsTable("events")

  # COMMAND ----------

@@ -334,7 +334,7 @@
  # MAGIC %md
  # MAGIC ### Write Results to a Delta Table
  # MAGIC
- # MAGIC Write **`events_df`** with the DataFrameWriter's **`save`** method and the following configurations: Delta format, overwrite mode
+ # MAGIC Write **`events_df`** with the DataFrameWriter's **`save`** method and the following configurations: Delta format & overwrite mode.

  # COMMAND ----------
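For reference (not part of this commit): a minimal sketch collecting the DataFrameWriter patterns touched in this file. `working_dir`, `users_df`, and `events_df` are assumed to exist as in the notebook; `delta_output_dir` is an illustrative name, not one from the course:

```python
# Parquet output with explicit options (mirrors the renamed users_output_dir cell).
users_output_dir = working_dir + "/users.parquet"
(users_df.write
    .option("compression", "snappy")
    .mode("overwrite")
    .parquet(users_output_dir)
)

# Managed table in the current database (mirrors the saveAsTable change).
events_df.write.mode("overwrite").saveAsTable("events")

# Delta format through the generic save method (the exercise described above).
delta_output_dir = working_dir + "/delta/events"   # illustrative path
(events_df.write
    .format("delta")
    .mode("overwrite")
    .save(delta_output_dir)
)
```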

Apache-Spark-Programming-with-Databricks/ASP 2 - Spark Core/ASP 2.3 - DataFrame & Column.py

Lines changed: 4 additions & 4 deletions
@@ -30,7 +30,7 @@

  # COMMAND ----------

- events_df = spark.read.parquet(events_path)
+ events_df = spark.read.format("delta").load(events_path)
  display(events_df)

  # COMMAND ----------
@@ -45,9 +45,9 @@

  from pyspark.sql.functions import col

- events_df.device
- events_df["device"]
- col("device")
+ print(events_df.device)
+ print(events_df["device"])
+ print(col("device"))

  # COMMAND ----------
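A side note on the `print` change above (a sketch added for reference, not part of this commit): each of the three expressions evaluates to a `Column` object rather than to data, so printing them only shows the column expression; values appear once the column is used in a transformation. Assuming `events_df` is loaded as in the cell above:

```python
from pyspark.sql.functions import col

# All three print a Column expression such as Column<'device'>, not rows of data.
print(events_df.device)
print(events_df["device"])
print(col("device"))

# Data is produced only when the Column is used in a query.
display(events_df.select(col("device")).distinct())
```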

Lines changed: 4 additions & 4 deletions
@@ -58,8 +58,8 @@

  # TODO
  mac_df = (events_df
-   .FILL_IN
- )
+     .FILL_IN
+ )

  # COMMAND ----------

@@ -74,7 +74,7 @@

  # COMMAND ----------

- # MAGIC %md **CHECK YOUR WORK**
+ # MAGIC %md **4.1: CHECK YOUR WORK**

  # COMMAND ----------

@@ -99,7 +99,7 @@

  # COMMAND ----------

- # MAGIC %md **CHECK YOUR WORK**
+ # MAGIC %md **5.1: CHECK YOUR WORK**
  # MAGIC - You should only see **`macOS`** values in the **`device`** column
  # MAGIC - The fifth row should be an event with timestamp **`1592539226602157`**
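One possible fill-in for the `mac_df` TODO above (a hedged sketch, not the published lab solution; the sort key is assumed from the timestamp mentioned in the check):

```python
from pyspark.sql.functions import col

# Keep only macOS events and order them by event timestamp
# (assumed to be the intended sort key).
mac_df = (events_df
    .where(col("device") == "macOS")
    .orderBy("event_timestamp")
)

display(mac_df)
```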

Lines changed: 6 additions & 6 deletions
@@ -38,11 +38,11 @@
  products_csv_path = f"{datasets_dir}/products/products.csv"
  products_df = FILL_IN

- productsDF.printSchema()
+ products_df.printSchema()

  # COMMAND ----------

- # MAGIC %md **CHECK YOUR WORK**
+ # MAGIC %md **1.1: CHECK YOUR WORK**

  # COMMAND ----------

@@ -62,7 +62,7 @@

  # COMMAND ----------

- # MAGIC %md **CHECK YOUR WORK**
+ # MAGIC %md **2.1: CHECK YOUR WORK**

  # COMMAND ----------

@@ -90,7 +90,7 @@


  # COMMAND ----------
- # MAGIC %md **CHECK YOUR WORK**
+ # MAGIC %md **3.1: CHECK YOUR WORK**

  # COMMAND ----------

@@ -99,7 +99,7 @@
  # COMMAND ----------

  # MAGIC %md ### 4. Write to Delta
- # MAGIC Write **`productsDF`** to the filepath provided in the variable **`productsOutputPath`**
+ # MAGIC Write **`products_df`** to the filepath provided in the variable **`products_output_path`**

  # COMMAND ----------

@@ -109,7 +109,7 @@

  # COMMAND ----------

- # MAGIC %md **CHECK YOUR WORK**
+ # MAGIC %md **4.1: CHECK YOUR WORK**

  # COMMAND ----------
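A hedged sketch of one possible fill-in for this lab (not the published solution); `datasets_dir` and `products_output_path` are assumed to come from the classroom setup, and the reader options are illustrative:

```python
# 1. Read the products CSV (header row, types inferred here for brevity).
products_csv_path = f"{datasets_dir}/products/products.csv"

products_df = (spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(products_csv_path)
)

products_df.printSchema()

# 4. Write to Delta at the provided path.
(products_df.write
    .format("delta")
    .mode("overwrite")
    .save(products_output_path)
)
```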

Lines changed: 7 additions & 7 deletions
@@ -28,7 +28,7 @@

  # COMMAND ----------

- events_df = spark.read.parquet(events_path)
+ events_df = spark.read.format("delta").load(events_path)
  display(events_df)

  # COMMAND ----------
@@ -44,7 +44,7 @@


  # COMMAND ----------
- # MAGIC %md **CHECK YOUR WORK**
+ # MAGIC %md **1.1: CHECK YOUR WORK**

  # COMMAND ----------

@@ -66,7 +66,7 @@


  # COMMAND ----------
- # MAGIC %md **CHECK YOUR WORK**
+ # MAGIC %md **2.1: CHECK YOUR WORK**

  # COMMAND ----------

@@ -91,7 +91,7 @@

  # MAGIC %md
  # MAGIC ### 4. Drop unneeded column
- # MAGIC Since there's only one event type, drop **`event_name`** from **`purchasesDF`**.
+ # MAGIC Since there's only one event type, drop **`event_name`** from **`purchases_df`**.

  # COMMAND ----------

@@ -101,13 +101,13 @@

  # COMMAND ----------

- # MAGIC %md **CHECK YOUR WORK**
+ # MAGIC %md **4.1: CHECK YOUR WORK**

  # COMMAND ----------

  expected_columns = {"device", "ecommerce", "event_previous_timestamp", "event_timestamp",
                      "geo", "items", "revenue", "traffic_source",
-                      "user_first_touch_timestamp", "user_id"}
+                     "user_first_touch_timestamp", "user_id"}
  assert(set(final_df.columns) == expected_columns)

  # COMMAND ----------
@@ -125,7 +125,7 @@

  # COMMAND ----------

- # MAGIC %md **CHECK YOUR WORK**
+ # MAGIC %md **5.1: CHECK YOUR WORK**

  # COMMAND ----------
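A hedged sketch of the drop step verified above (not the published solution); `purchases_df` is assumed to already hold only purchase events, and the expected column set comes directly from the assertion in the diff:

```python
# Drop the single-valued event_name column and verify the remaining schema.
final_df = purchases_df.drop("event_name")

expected_columns = {"device", "ecommerce", "event_previous_timestamp", "event_timestamp",
                    "geo", "items", "revenue", "traffic_source",
                    "user_first_touch_timestamp", "user_id"}
assert set(final_df.columns) == expected_columns
```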

Apache-Spark-Programming-with-Databricks/ASP 3 - Functions/ASP 3.1 - Aggregation.py

Lines changed: 9 additions & 10 deletions
@@ -30,7 +30,7 @@

  # COMMAND ----------

- df = spark.read.parquet(events_path)
+ df = spark.read.format("delta").load(events_path)
  display(df)

  # COMMAND ----------
@@ -140,10 +140,10 @@
  from pyspark.sql.functions import avg, approx_count_distinct

  state_aggregates_df = (df
-                        .groupBy("geo.state")
-                        .agg(avg("ecommerce.total_item_quantity").alias("avg_quantity"),
-                             approx_count_distinct("user_id").alias("distinct_users"))
-                        )
+     .groupBy("geo.state")
+     .agg(avg("ecommerce.total_item_quantity").alias("avg_quantity"),
+          approx_count_distinct("user_id").alias("distinct_users"))
+ )

  display(state_aggregates_df)

@@ -164,11 +164,10 @@

  from pyspark.sql.functions import cos, sqrt

- display(
-     spark.range(10)  # Create a DataFrame with a single column called "id" with a range of integer values
-     .withColumn("sqrt", sqrt("id"))
-     .withColumn("cos", cos("id"))
- )
+ display(spark.range(10)  # Create a DataFrame with a single column called "id" with a range of integer values
+         .withColumn("sqrt", sqrt("id"))
+         .withColumn("cos", cos("id"))
+ )

  # COMMAND ----------
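For reference (not part of this commit): the grouped aggregation reformatted above, restated as a minimal sketch with an exact distinct count added for comparison; `df` is assumed to be the events DataFrame loaded from `events_path`:

```python
from pyspark.sql.functions import avg, approx_count_distinct, countDistinct

state_aggregates_df = (df
    .groupBy("geo.state")  # "geo.state" addresses a field inside the geo struct column
    .agg(avg("ecommerce.total_item_quantity").alias("avg_quantity"),
         approx_count_distinct("user_id").alias("distinct_users"),   # approximate, cheaper at scale
         countDistinct("user_id").alias("exact_distinct_users"))     # exact, more expensive
)

display(state_aggregates_df)
```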
