
Commit 451a55c

SireInsectus authored and committed
Publishing v2.2.0
1 parent fb0019f commit 451a55c


11 files changed (+196, -50 lines)


Apache-Spark-Programming-with-Databricks/ASP 2 - Spark Core/ASP 2.3L - Purchase Revenues Lab.py

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@

 # COMMAND ----------

+from pyspark.sql.functions import col
 expected1 = [5830.0, 5485.0, 5289.0, 5219.1, 5180.0, 5175.0, 5125.0, 5030.0, 4985.0, 4985.0]
 result1 = [row.revenue for row in revenue_df.sort(col("revenue").desc_nulls_last()).limit(10).collect()]
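For readers following along, here is a minimal sketch of what this added import enables, on throwaway data rather than the course's revenue_df (assumes a live spark session, as in any Databricks notebook):

from pyspark.sql.functions import col

# Toy stand-in for revenue_df; the None row shows why desc_nulls_last matters.
df = spark.createDataFrame([(5830.0,), (None,), (5485.0,)], ["revenue"])

# Sort descending with nulls pushed to the end, then collect the top rows.
top = [row.revenue for row in
       df.sort(col("revenue").desc_nulls_last()).limit(2).collect()]
print(top)  # [5830.0, 5485.0]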

Apache-Spark-Programming-with-Databricks/ASP 3 - Functions/ASP 3.3 - Complex Types.py

Lines changed: 28 additions & 1 deletion
@@ -17,7 +17,7 @@
 # MAGIC 1. Union DataFrames together
 # MAGIC
 # MAGIC ##### Methods
-# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" target="_blank">DataFrame</a>: **`unionByName`**
+# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" target="_blank">DataFrame</a>: **`union`**, **`unionByName`**
 # MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html?#functions" target="_blank">Built-In Functions</a>:
 # MAGIC - Aggregate: **`collect_set`**
 # MAGIC - Collection: **`array_contains`**, **`element_at`**, **`explode`**
@@ -117,6 +117,33 @@

 # COMMAND ----------

+# MAGIC %md
+# MAGIC
+# MAGIC ## Union and unionByName
+# MAGIC <img src="https://files.training.databricks.com/images/icon_warn_32.png" alt="Warning"> The DataFrame <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.union.html" target="_blank">**`union`**</a> method resolves columns by position, as in standard SQL. You should use it only if the two DataFrames have exactly the same schema, including the column order. In contrast, the DataFrame <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.unionByName.html" target="_blank">**`unionByName`**</a> method resolves columns by name. Both are equivalent to UNION ALL in SQL: neither removes duplicates.
+# MAGIC
+# MAGIC Below is a check to see whether the two DataFrames have matching schemas, in which case **`union`** would be appropriate.
+
+# COMMAND ----------
+
+mattress_df.schema == size_df.schema
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC If we can get the two schemas to match with a simple **`select`** statement, then we can use **`union`**.
+
+# COMMAND ----------
+
+union_count = mattress_df.select("email").union(size_df.select("email")).count()
+
+mattress_count = mattress_df.count()
+size_count = size_df.count()
+
+mattress_count + size_count == union_count
+
+# COMMAND ----------
+
 # MAGIC %md ### Clean up classroom
 # MAGIC
 # MAGIC And lastly, we'll clean up the classroom.
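To make the position-versus-name distinction above concrete, here is a small self-contained sketch on two invented one-row DataFrames (not the course's mattress_df/size_df; assumes a live spark session):

# Same columns, different order.
df_a = spark.createDataFrame([(1, 2)], ["x", "y"])
df_b = spark.createDataFrame([(20, 10)], ["y", "x"])

# union matches by position: the second row comes out as x=20, y=10.
df_a.union(df_b).show()

# unionByName matches by name: the second row comes out as x=10, y=20.
df_a.unionByName(df_b).show()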

Apache-Spark-Programming-with-Databricks/ASP 3 - Functions/ASP 3.3L - Users.py

Lines changed: 0 additions & 2 deletions
@@ -100,8 +100,6 @@
 # MAGIC - Drop the **`details`** column
 # MAGIC
 # MAGIC Save the result as **`union_df`**.
-# MAGIC
-# MAGIC <img src="https://files.training.databricks.com/images/icon_warn_32.png" alt="Warning"> The DataFrame <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.union.html" target="_blank">**`union`**</a> method resolves columns by position, as in standard SQL. You should use it only if the two DataFrames have exactly the same schema, including the column order. In contrast, the DataFrame <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.unionByName.html" target="_blank">**`unionByName`**</a> method resolves columns by name.

 # COMMAND ----------

Apache-Spark-Programming-with-Databricks/ASP 3 - Functions/ASP 3.4 - Additional Functions.py

Lines changed: 68 additions & 20 deletions
@@ -16,7 +16,8 @@
 # MAGIC 1. Join DataFrames
 # MAGIC
 # MAGIC ##### Methods
-# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameNaFunctions.html" target="_blank">DataFrameNaFunctions</a>: **`fill`**
+# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.join.html?highlight=dataframe%20join#pyspark.sql.DataFrame.join" target="_blank">DataFrame Methods</a>: **`join`**
+# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameNaFunctions.html" target="_blank">DataFrameNaFunctions</a>: **`fill`**, **`drop`**
 # MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html?#functions" target="_blank">Built-In Functions</a>:
 # MAGIC - Aggregate: **`collect_set`**
 # MAGIC - Collection: **`explode`**
@@ -32,51 +33,79 @@

 # COMMAND ----------

-# MAGIC %md ### DataFrameNaFunctions
-# MAGIC <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameNaFunctions.html" target="_blank">DataFrameNaFunctions</a> is a DataFrame submodule with methods for handling null values. Obtain an instance of DataFrameNaFunctions by accessing the **`na`** attribute of a DataFrame.
+sales_df = spark.read.format("delta").load(sales_path)
+display(sales_df)
+
+# COMMAND ----------
+
+# MAGIC %md ### Non-aggregate and Miscellaneous Functions
+# MAGIC Here are a few additional non-aggregate and miscellaneous built-in functions.
 # MAGIC
 # MAGIC | Method | Description |
 # MAGIC | --- | --- |
-# MAGIC | drop | Returns a new DataFrame omitting rows with any, all, or a specified number of null values, considering an optional subset of columns |
-# MAGIC | fill | Replace null values with the specified value for an optional subset of columns |
-# MAGIC | replace | Returns a new DataFrame replacing a value with another value, considering an optional subset of columns |
+# MAGIC | col / column | Returns a Column based on the given column name. |
+# MAGIC | lit | Creates a Column of literal value |
+# MAGIC | isnull | Return true iff the column is null |
+# MAGIC | rand | Generate a random column with independent and identically distributed (i.i.d.) samples uniformly distributed in [0.0, 1.0) |

 # COMMAND ----------

-sales_df = spark.read.format("delta").load(sales_path)
-display(sales_df)
+# MAGIC %md We could select a particular column using the **`col`** function.

 # COMMAND ----------

-# MAGIC %md Let's say we need to remove the email addresses from our dataset.
+gmail_accounts = sales_df.filter(col("email").contains("gmail"))
+
+display(gmail_accounts)

 # COMMAND ----------

-no_pii_df = sales_df.drop("email")
+# MAGIC %md **`lit`** can be used to create a column out of a value, which is useful for appending columns.
+
+# COMMAND ----------

-display(no_pii_df)
+display(gmail_accounts.select("email", lit(True).alias("gmail user")))

 # COMMAND ----------

-# MAGIC %md ### Non-aggregate and Miscellaneous Functions
-# MAGIC Here are a few additional non-aggregate and miscellaneous built-in functions.
+# MAGIC %md ### DataFrameNaFunctions
+# MAGIC <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameNaFunctions.html" target="_blank">DataFrameNaFunctions</a> is a DataFrame submodule with methods for handling null values. Obtain an instance of DataFrameNaFunctions by accessing the **`na`** attribute of a DataFrame.
 # MAGIC
 # MAGIC | Method | Description |
 # MAGIC | --- | --- |
-# MAGIC | col / column | Returns a Column based on the given column name. |
-# MAGIC | lit | Creates a Column of literal value |
-# MAGIC | isnull | Return true iff the column is null |
-# MAGIC | rand | Generate a random column with independent and identically distributed (i.i.d.) samples uniformly distributed in [0.0, 1.0) |
+# MAGIC | drop | Returns a new DataFrame omitting rows with any, all, or a specified number of null values, considering an optional subset of columns |
+# MAGIC | fill | Replace null values with the specified value for an optional subset of columns |
+# MAGIC | replace | Returns a new DataFrame replacing a value with another value, considering an optional subset of columns |

 # COMMAND ----------

-# MAGIC %md We could select a particular column using the **`col`** function
+# MAGIC %md
+# MAGIC Here we'll see the row count before and after dropping rows with null/NA values.

 # COMMAND ----------

-gmail_accounts = sales_df.filter(col("email").contains("gmail"))
+print(sales_df.count())
+print(sales_df.na.drop().count())

-display(gmail_accounts)
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC Since the row counts are the same, no rows contain null values. We'll need to explode **`items`** to find nulls in nested columns such as **`items.coupon`**.
+
+# COMMAND ----------
+
+sales_exploded_df = sales_df.withColumn("items", explode(col("items")))
+display(sales_exploded_df.select("items.coupon"))
+print(sales_exploded_df.select("items.coupon").count())
+print(sales_exploded_df.select("items.coupon").na.drop().count())
+
+# COMMAND ----------
+
+# MAGIC %md We can fill in the missing coupon codes with **`na.fill`**.
+
+# COMMAND ----------
+
+display(sales_exploded_df.select("items.coupon").na.fill("NO COUPON"))

 # COMMAND ----------

@@ -99,6 +128,25 @@

 # COMMAND ----------

+# MAGIC %md
+# MAGIC We'll load in our users data to join with our **`gmail_accounts`** from above.
+
+# COMMAND ----------
+
+users_df = spark.read.format("delta").load(users_path)
+display(users_df)
+
+# COMMAND ----------
+
+joined_df = gmail_accounts.join(other=users_df, on="email", how="inner")
+display(joined_df)
+
+# COMMAND ----------
+
+classroom_cleanup()
+
+# COMMAND ----------
+
 # MAGIC %md-sandbox
 # MAGIC &copy; 2022 Databricks, Inc. All rights reserved.<br/>
 # MAGIC Apache, Apache Spark, Spark and the Spark logo are trademarks of the <a href="https://www.apache.org/">Apache Software Foundation</a>.<br/>
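As a compact illustration of the na accessor exercised above, here is a sketch on invented data (the column names are made up for the example; assumes a live spark session):

df = spark.createDataFrame(
    [("a@example.com", "SUMMER10"), ("b@example.com", None)],
    ["email", "coupon"],
)

print(df.count())            # 2
print(df.na.drop().count())  # 1 -- the row with a null coupon is dropped

# Or keep the row and substitute a default, as the notebook does with "NO COUPON".
df.na.fill("NO COUPON", subset=["coupon"]).show()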

Apache-Spark-Programming-with-Databricks/ASP 5 - Streaming/ASP 5.1cL - Activity by Traffic Lab.py

Lines changed: 0 additions & 1 deletion
@@ -36,7 +36,6 @@
 # COMMAND ----------

 # MAGIC %md ### 1. Read data stream
-# MAGIC - Use schema stored in **`schema`**
 # MAGIC - Set to process 1 file per trigger
 # MAGIC - Read from Delta with filepath stored in **`events_path`**
 # MAGIC
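For orientation, one plausible shape of a solution to these bullets, sketched here rather than taken from the lab's answer key (assumes a Databricks spark session and that events_path points at a Delta directory):

# The Delta log supplies the schema, consistent with the removed bullet;
# maxFilesPerTrigger=1 caps each micro-batch at one input file.
df = (spark.readStream
           .option("maxFilesPerTrigger", 1)
           .format("delta")
           .load(events_path))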

Apache-Spark-Programming-with-Databricks/Solutions/ASP 2 - Spark Core/ASP 2.3L - Purchase Revenues Lab.py

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@

 # COMMAND ----------

+from pyspark.sql.functions import col
 expected1 = [5830.0, 5485.0, 5289.0, 5219.1, 5180.0, 5175.0, 5125.0, 5030.0, 4985.0, 4985.0]
 result1 = [row.revenue for row in revenue_df.sort(col("revenue").desc_nulls_last()).limit(10).collect()]

Apache-Spark-Programming-with-Databricks/Solutions/ASP 3 - Functions/ASP 3.3 - Complex Types.py

Lines changed: 28 additions & 1 deletion
@@ -17,7 +17,7 @@
 # MAGIC 1. Union DataFrames together
 # MAGIC
 # MAGIC ##### Methods
-# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" target="_blank">DataFrame</a>: **`unionByName`**
+# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" target="_blank">DataFrame</a>: **`union`**, **`unionByName`**
 # MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html?#functions" target="_blank">Built-In Functions</a>:
 # MAGIC - Aggregate: **`collect_set`**
 # MAGIC - Collection: **`array_contains`**, **`element_at`**, **`explode`**
@@ -117,6 +117,33 @@

 # COMMAND ----------

+# MAGIC %md
+# MAGIC
+# MAGIC ## Union and unionByName
+# MAGIC <img src="https://files.training.databricks.com/images/icon_warn_32.png" alt="Warning"> The DataFrame <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.union.html" target="_blank">**`union`**</a> method resolves columns by position, as in standard SQL. You should use it only if the two DataFrames have exactly the same schema, including the column order. In contrast, the DataFrame <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.unionByName.html" target="_blank">**`unionByName`**</a> method resolves columns by name. Both are equivalent to UNION ALL in SQL: neither removes duplicates.
+# MAGIC
+# MAGIC Below is a check to see whether the two DataFrames have matching schemas, in which case **`union`** would be appropriate.
+
+# COMMAND ----------
+
+mattress_df.schema == size_df.schema
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC If we can get the two schemas to match with a simple **`select`** statement, then we can use **`union`**.
+
+# COMMAND ----------
+
+union_count = mattress_df.select("email").union(size_df.select("email")).count()
+
+mattress_count = mattress_df.count()
+size_count = size_df.count()
+
+mattress_count + size_count == union_count
+
+# COMMAND ----------
+
 # MAGIC %md ### Clean up classroom
 # MAGIC
 # MAGIC And lastly, we'll clean up the classroom.

Apache-Spark-Programming-with-Databricks/Solutions/ASP 3 - Functions/ASP 3.3L - Users.py

Lines changed: 0 additions & 2 deletions
@@ -99,8 +99,6 @@
 # MAGIC - Drop the **`details`** column
 # MAGIC
 # MAGIC Save the result as **`union_df`**.
-# MAGIC
-# MAGIC <img src="https://files.training.databricks.com/images/icon_warn_32.png" alt="Warning"> The DataFrame <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.union.html" target="_blank">**`union`**</a> method resolves columns by position, as in standard SQL. You should use it only if the two DataFrames have exactly the same schema, including the column order. In contrast, the DataFrame <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.unionByName.html" target="_blank">**`unionByName`**</a> method resolves columns by name.

 # COMMAND ----------

Apache-Spark-Programming-with-Databricks/Solutions/ASP 3 - Functions/ASP 3.4 - Additional Functions.py

Lines changed: 68 additions & 20 deletions
@@ -16,7 +16,8 @@
 # MAGIC 1. Join DataFrames
 # MAGIC
 # MAGIC ##### Methods
-# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameNaFunctions.html" target="_blank">DataFrameNaFunctions</a>: **`fill`**
+# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.join.html?highlight=dataframe%20join#pyspark.sql.DataFrame.join" target="_blank">DataFrame Methods</a>: **`join`**
+# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameNaFunctions.html" target="_blank">DataFrameNaFunctions</a>: **`fill`**, **`drop`**
 # MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html?#functions" target="_blank">Built-In Functions</a>:
 # MAGIC - Aggregate: **`collect_set`**
 # MAGIC - Collection: **`explode`**
@@ -32,51 +33,79 @@

 # COMMAND ----------

-# MAGIC %md ### DataFrameNaFunctions
-# MAGIC <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameNaFunctions.html" target="_blank">DataFrameNaFunctions</a> is a DataFrame submodule with methods for handling null values. Obtain an instance of DataFrameNaFunctions by accessing the **`na`** attribute of a DataFrame.
+sales_df = spark.read.format("delta").load(sales_path)
+display(sales_df)
+
+# COMMAND ----------
+
+# MAGIC %md ### Non-aggregate and Miscellaneous Functions
+# MAGIC Here are a few additional non-aggregate and miscellaneous built-in functions.
 # MAGIC
 # MAGIC | Method | Description |
 # MAGIC | --- | --- |
-# MAGIC | drop | Returns a new DataFrame omitting rows with any, all, or a specified number of null values, considering an optional subset of columns |
-# MAGIC | fill | Replace null values with the specified value for an optional subset of columns |
-# MAGIC | replace | Returns a new DataFrame replacing a value with another value, considering an optional subset of columns |
+# MAGIC | col / column | Returns a Column based on the given column name. |
+# MAGIC | lit | Creates a Column of literal value |
+# MAGIC | isnull | Return true iff the column is null |
+# MAGIC | rand | Generate a random column with independent and identically distributed (i.i.d.) samples uniformly distributed in [0.0, 1.0) |

 # COMMAND ----------

-sales_df = spark.read.format("delta").load(sales_path)
-display(sales_df)
+# MAGIC %md We could select a particular column using the **`col`** function.

 # COMMAND ----------

-# MAGIC %md Let's say we need to remove the email addresses from our dataset.
+gmail_accounts = sales_df.filter(col("email").contains("gmail"))
+
+display(gmail_accounts)

 # COMMAND ----------

-no_pii_df = sales_df.drop("email")
+# MAGIC %md **`lit`** can be used to create a column out of a value, which is useful for appending columns.
+
+# COMMAND ----------

-display(no_pii_df)
+display(gmail_accounts.select("email", lit(True).alias("gmail user")))

 # COMMAND ----------

-# MAGIC %md ### Non-aggregate and Miscellaneous Functions
-# MAGIC Here are a few additional non-aggregate and miscellaneous built-in functions.
+# MAGIC %md ### DataFrameNaFunctions
+# MAGIC <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameNaFunctions.html" target="_blank">DataFrameNaFunctions</a> is a DataFrame submodule with methods for handling null values. Obtain an instance of DataFrameNaFunctions by accessing the **`na`** attribute of a DataFrame.
 # MAGIC
 # MAGIC | Method | Description |
 # MAGIC | --- | --- |
-# MAGIC | col / column | Returns a Column based on the given column name. |
-# MAGIC | lit | Creates a Column of literal value |
-# MAGIC | isnull | Return true iff the column is null |
-# MAGIC | rand | Generate a random column with independent and identically distributed (i.i.d.) samples uniformly distributed in [0.0, 1.0) |
+# MAGIC | drop | Returns a new DataFrame omitting rows with any, all, or a specified number of null values, considering an optional subset of columns |
+# MAGIC | fill | Replace null values with the specified value for an optional subset of columns |
+# MAGIC | replace | Returns a new DataFrame replacing a value with another value, considering an optional subset of columns |

 # COMMAND ----------

-# MAGIC %md We could select a particular column using the **`col`** function
+# MAGIC %md
+# MAGIC Here we'll see the row count before and after dropping rows with null/NA values.

 # COMMAND ----------

-gmail_accounts = sales_df.filter(col("email").contains("gmail"))
+print(sales_df.count())
+print(sales_df.na.drop().count())

-display(gmail_accounts)
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC Since the row counts are the same, no rows contain null values. We'll need to explode **`items`** to find nulls in nested columns such as **`items.coupon`**.
+
+# COMMAND ----------
+
+sales_exploded_df = sales_df.withColumn("items", explode(col("items")))
+display(sales_exploded_df.select("items.coupon"))
+print(sales_exploded_df.select("items.coupon").count())
+print(sales_exploded_df.select("items.coupon").na.drop().count())
+
+# COMMAND ----------
+
+# MAGIC %md We can fill in the missing coupon codes with **`na.fill`**.
+
+# COMMAND ----------
+
+display(sales_exploded_df.select("items.coupon").na.fill("NO COUPON"))

 # COMMAND ----------

@@ -99,6 +128,25 @@

 # COMMAND ----------

+# MAGIC %md
+# MAGIC We'll load in our users data to join with our **`gmail_accounts`** from above.
+
+# COMMAND ----------
+
+users_df = spark.read.format("delta").load(users_path)
+display(users_df)
+
+# COMMAND ----------
+
+joined_df = gmail_accounts.join(other=users_df, on="email", how="inner")
+display(joined_df)
+
+# COMMAND ----------
+
+classroom_cleanup()
+
+# COMMAND ----------
+
 # MAGIC %md-sandbox
 # MAGIC &copy; 2022 Databricks, Inc. All rights reserved.<br/>
 # MAGIC Apache, Apache Spark, Spark and the Spark logo are trademarks of the <a href="https://www.apache.org/">Apache Software Foundation</a>.<br/>
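To close out, here is the filter-tag-join pattern from this diff condensed into one runnable sketch on invented rows (the sales/users shapes are assumptions for illustration, not the course datasets; assumes a live spark session):

from pyspark.sql.functions import col, lit

sales = spark.createDataFrame(
    [("a@gmail.com", 100.0), ("b@yahoo.com", 50.0)], ["email", "revenue"])
users = spark.createDataFrame(
    [("a@gmail.com", "Ada"), ("b@yahoo.com", "Bo")], ["email", "name"])

# Keep gmail addresses, tag them with a literal column, then inner-join
# to users on the shared email key.
gmail = sales.filter(col("email").contains("gmail"))
(gmail.select("email", lit(True).alias("gmail user"))
      .join(users, on="email", how="inner")
      .show())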
