
Commit 451a55c

SireInsectus authored and committed
Publishing v2.2.0
1 parent fb0019f commit 451a55c


11 files changed (+196, -50 lines)


Apache-Spark-Programming-with-Databricks/ASP 2 - Spark Core/ASP 2.3L - Purchase Revenues Lab.py

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@

 # COMMAND ----------

+from pyspark.sql.functions import col
 expected1 = [5830.0, 5485.0, 5289.0, 5219.1, 5180.0, 5175.0, 5125.0, 5030.0, 4985.0, 4985.0]
 result1 = [row.revenue for row in revenue_df.sort(col("revenue").desc_nulls_last()).limit(10).collect()]
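For readers following along, here is a minimal sketch of what this added import enables, on throwaway data rather than the course's revenue_df (assumes a live spark session, as in any Databricks notebook):

from pyspark.sql.functions import col

# Toy stand-in for revenue_df; the None row shows why desc_nulls_last matters.
df = spark.createDataFrame([(5830.0,), (None,), (5485.0,)], ["revenue"])

# Sort descending with nulls pushed to the end, then collect the top rows.
top = [row.revenue for row in
       df.sort(col("revenue").desc_nulls_last()).limit(2).collect()]
print(top)  # [5830.0, 5485.0]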

Apache-Spark-Programming-with-Databricks/ASP 3 - Functions/ASP 3.3 - Complex Types.py

Lines changed: 28 additions & 1 deletion
@@ -17,7 +17,7 @@
 # MAGIC 1. Union DataFrames together
 # MAGIC
 # MAGIC ##### Methods
-# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" target="_blank">DataFrame</a>: **`unionByName`**
+# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" target="_blank">DataFrame</a>: **`union`**, **`unionByName`**
 # MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html?#functions" target="_blank">Built-In Functions</a>:
 # MAGIC - Aggregate: **`collect_set`**
 # MAGIC - Collection: **`array_contains`**, **`element_at`**, **`explode`**
@@ -117,6 +117,33 @@

 # COMMAND ----------

+# MAGIC %md
+# MAGIC
+# MAGIC ## Union and unionByName
+# MAGIC <img src="https://files.training.databricks.com/images/icon_warn_32.png" alt="Warning"> The DataFrame <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.union.html" target="_blank">**`union`**</a> method resolves columns by position, as in standard SQL. You should use it only if the two DataFrames have exactly the same schema, including the column order. In contrast, the DataFrame <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.unionByName.html" target="_blank">**`unionByName`**</a> method resolves columns by name. Both are equivalent to UNION ALL in SQL: neither removes duplicates.
+# MAGIC
+# MAGIC Below is a check to see whether the two DataFrames have matching schemas, in which case **`union`** would be appropriate.
+
+# COMMAND ----------
+
+mattress_df.schema == size_df.schema
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC If we can get the two schemas to match with a simple **`select`** statement, then we can use **`union`**.
+
+# COMMAND ----------
+
+union_count = mattress_df.select("email").union(size_df.select("email")).count()
+
+mattress_count = mattress_df.count()
+size_count = size_df.count()
+
+mattress_count + size_count == union_count
+
+# COMMAND ----------
+
 # MAGIC %md ### Clean up classroom
 # MAGIC
 # MAGIC And lastly, we'll clean up the classroom.
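To make the position-versus-name distinction above concrete, here is a small self-contained sketch on two invented one-row DataFrames (not the course's mattress_df/size_df; assumes a live spark session):

# Same columns, different order.
df_a = spark.createDataFrame([(1, 2)], ["x", "y"])
df_b = spark.createDataFrame([(20, 10)], ["y", "x"])

# union matches by position: the second row comes out as x=20, y=10.
df_a.union(df_b).show()

# unionByName matches by name: the second row comes out as x=10, y=20.
df_a.unionByName(df_b).show()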

Apache-Spark-Programming-with-Databricks/ASP 3 - Functions/ASP 3.3L - Users.py

Lines changed: 0 additions & 2 deletions
@@ -100,8 +100,6 @@
 # MAGIC - Drop the **`details`** column
 # MAGIC
 # MAGIC Save the result as **`union_df`**.
-# MAGIC
-# MAGIC <img src="https://files.training.databricks.com/images/icon_warn_32.png" alt="Warning"> The DataFrame <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.union.html" target="_blank">**`union`**</a> method resolves columns by position, as in standard SQL. You should use it only if the two DataFrames have exactly the same schema, including the column order. In contrast, the DataFrame <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.unionByName.html" target="_blank">**`unionByName`**</a> method resolves columns by name.

 # COMMAND ----------

Apache-Spark-Programming-with-Databricks/ASP 3 - Functions/ASP 3.4 - Additional Functions.py

Lines changed: 68 additions & 20 deletions
@@ -16,7 +16,8 @@
 # MAGIC 1. Join DataFrames
 # MAGIC
 # MAGIC ##### Methods
-# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameNaFunctions.html" target="_blank">DataFrameNaFunctions</a>: **`fill`**
+# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.join.html?highlight=dataframe%20join#pyspark.sql.DataFrame.join" target="_blank">DataFrame Methods</a>: **`join`**
+# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameNaFunctions.html" target="_blank">DataFrameNaFunctions</a>: **`fill`**, **`drop`**
 # MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html?#functions" target="_blank">Built-In Functions</a>:
 # MAGIC - Aggregate: **`collect_set`**
 # MAGIC - Collection: **`explode`**
@@ -32,51 +33,79 @@

 # COMMAND ----------

-# MAGIC %md ### DataFrameNaFunctions
-# MAGIC <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameNaFunctions.html" target="_blank">DataFrameNaFunctions</a> is a DataFrame submodule with methods for handling null values. Obtain an instance of DataFrameNaFunctions by accessing the **`na`** attribute of a DataFrame.
+sales_df = spark.read.format("delta").load(sales_path)
+display(sales_df)
+
+# COMMAND ----------
+
+# MAGIC %md ### Non-aggregate and Miscellaneous Functions
+# MAGIC Here are a few additional non-aggregate and miscellaneous built-in functions.
 # MAGIC
 # MAGIC | Method | Description |
 # MAGIC | --- | --- |
-# MAGIC | drop | Returns a new DataFrame omitting rows with any, all, or a specified number of null values, considering an optional subset of columns |
-# MAGIC | fill | Replace null values with the specified value for an optional subset of columns |
-# MAGIC | replace | Returns a new DataFrame replacing a value with another value, considering an optional subset of columns |
+# MAGIC | col / column | Returns a Column based on the given column name. |
+# MAGIC | lit | Creates a Column of literal value |
+# MAGIC | isnull | Return true iff the column is null |
+# MAGIC | rand | Generate a random column with independent and identically distributed (i.i.d.) samples uniformly distributed in [0.0, 1.0) |

 # COMMAND ----------

-sales_df = spark.read.format("delta").load(sales_path)
-display(sales_df)
+# MAGIC %md We could select a particular column using the **`col`** function.

 # COMMAND ----------

-# MAGIC %md Let's say we need to remove the email addresses from our dataset.
+gmail_accounts = sales_df.filter(col("email").contains("gmail"))
+
+display(gmail_accounts)

 # COMMAND ----------

-no_pii_df = sales_df.drop("email")
+# MAGIC %md **`lit`** can be used to create a column out of a value, which is useful for appending columns.
+
+# COMMAND ----------

-display(no_pii_df)
+display(gmail_accounts.select("email", lit(True).alias("gmail user")))

 # COMMAND ----------

-# MAGIC %md ### Non-aggregate and Miscellaneous Functions
-# MAGIC Here are a few additional non-aggregate and miscellaneous built-in functions.
+# MAGIC %md ### DataFrameNaFunctions
+# MAGIC <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameNaFunctions.html" target="_blank">DataFrameNaFunctions</a> is a DataFrame submodule with methods for handling null values. Obtain an instance of DataFrameNaFunctions by accessing the **`na`** attribute of a DataFrame.
 # MAGIC
 # MAGIC | Method | Description |
 # MAGIC | --- | --- |
-# MAGIC | col / column | Returns a Column based on the given column name. |
-# MAGIC | lit | Creates a Column of literal value |
-# MAGIC | isnull | Return true iff the column is null |
-# MAGIC | rand | Generate a random column with independent and identically distributed (i.i.d.) samples uniformly distributed in [0.0, 1.0) |
+# MAGIC | drop | Returns a new DataFrame omitting rows with any, all, or a specified number of null values, considering an optional subset of columns |
+# MAGIC | fill | Replace null values with the specified value for an optional subset of columns |
+# MAGIC | replace | Returns a new DataFrame replacing a value with another value, considering an optional subset of columns |

 # COMMAND ----------

-# MAGIC %md We could select a particular column using the **`col`** function
+# MAGIC %md
+# MAGIC Here we'll see the row count before and after dropping rows with null/NA values.

 # COMMAND ----------

-gmail_accounts = sales_df.filter(col("email").contains("gmail"))
+print(sales_df.count())
+print(sales_df.na.drop().count())

-display(gmail_accounts)
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC Since the row counts are the same, no rows contain null values. We'll need to explode **`items`** to find nulls in nested columns such as **`items.coupon`**.
+
+# COMMAND ----------
+
+sales_exploded_df = sales_df.withColumn("items", explode(col("items")))
+display(sales_exploded_df.select("items.coupon"))
+print(sales_exploded_df.select("items.coupon").count())
+print(sales_exploded_df.select("items.coupon").na.drop().count())
+
+# COMMAND ----------
+
+# MAGIC %md We can fill in the missing coupon codes with **`na.fill`**.
+
+# COMMAND ----------
+
+display(sales_exploded_df.select("items.coupon").na.fill("NO COUPON"))

 # COMMAND ----------

@@ -99,6 +128,25 @@

 # COMMAND ----------

+# MAGIC %md
+# MAGIC We'll load in our users data to join with our **`gmail_accounts`** from above.
+
+# COMMAND ----------
+
+users_df = spark.read.format("delta").load(users_path)
+display(users_df)
+
+# COMMAND ----------
+
+joined_df = gmail_accounts.join(other=users_df, on="email", how="inner")
+display(joined_df)
+
+# COMMAND ----------
+
+classroom_cleanup()
+
+# COMMAND ----------
+
 # MAGIC %md-sandbox
 # MAGIC &copy; 2022 Databricks, Inc. All rights reserved.<br/>
 # MAGIC Apache, Apache Spark, Spark and the Spark logo are trademarks of the <a href="https://www.apache.org/">Apache Software Foundation</a>.<br/>
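As a compact illustration of the na accessor exercised above, here is a sketch on invented data (the column names are made up for the example; assumes a live spark session):

df = spark.createDataFrame(
    [("a@example.com", "SUMMER10"), ("b@example.com", None)],
    ["email", "coupon"],
)

print(df.count())            # 2
print(df.na.drop().count())  # 1 -- the row with a null coupon is dropped

# Or keep the row and substitute a default, as the notebook does with "NO COUPON".
df.na.fill("NO COUPON", subset=["coupon"]).show()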

Apache-Spark-Programming-with-Databricks/ASP 5 - Streaming/ASP 5.1cL - Activity by Traffic Lab.py

Lines changed: 0 additions & 1 deletion
@@ -36,7 +36,6 @@
 # COMMAND ----------

 # MAGIC %md ### 1. Read data stream
-# MAGIC - Use schema stored in **`schema`**
 # MAGIC - Set to process 1 file per trigger
 # MAGIC - Read from Delta with filepath stored in **`events_path`**
 # MAGIC
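For orientation, one plausible shape of a solution to these bullets, sketched here rather than taken from the lab's answer key (assumes a Databricks spark session and that events_path points at a Delta directory):

# The Delta log supplies the schema, consistent with the removed bullet;
# maxFilesPerTrigger=1 caps each micro-batch at one input file.
df = (spark.readStream
           .option("maxFilesPerTrigger", 1)
           .format("delta")
           .load(events_path))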

Apache-Spark-Programming-with-Databricks/Solutions/ASP 2 - Spark Core/ASP 2.3L - Purchase Revenues Lab.py

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@

 # COMMAND ----------

+from pyspark.sql.functions import col
 expected1 = [5830.0, 5485.0, 5289.0, 5219.1, 5180.0, 5175.0, 5125.0, 5030.0, 4985.0, 4985.0]
 result1 = [row.revenue for row in revenue_df.sort(col("revenue").desc_nulls_last()).limit(10).collect()]

Apache-Spark-Programming-with-Databricks/Solutions/ASP 3 - Functions/ASP 3.3 - Complex Types.py

Lines changed: 28 additions & 1 deletion
@@ -17,7 +17,7 @@
 # MAGIC 1. Union DataFrames together
 # MAGIC
 # MAGIC ##### Methods
-# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" target="_blank">DataFrame</a>: **`unionByName`**
+# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" target="_blank">DataFrame</a>: **`union`**, **`unionByName`**
 # MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html?#functions" target="_blank">Built-In Functions</a>:
 # MAGIC - Aggregate: **`collect_set`**
 # MAGIC - Collection: **`array_contains`**, **`element_at`**, **`explode`**
@@ -117,6 +117,33 @@

 # COMMAND ----------

+# MAGIC %md
+# MAGIC
+# MAGIC ## Union and unionByName
+# MAGIC <img src="https://files.training.databricks.com/images/icon_warn_32.png" alt="Warning"> The DataFrame <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.union.html" target="_blank">**`union`**</a> method resolves columns by position, as in standard SQL. You should use it only if the two DataFrames have exactly the same schema, including the column order. In contrast, the DataFrame <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.unionByName.html" target="_blank">**`unionByName`**</a> method resolves columns by name. Both are equivalent to UNION ALL in SQL: neither removes duplicates.
+# MAGIC
+# MAGIC Below is a check to see whether the two DataFrames have matching schemas, in which case **`union`** would be appropriate.
+
+# COMMAND ----------
+
+mattress_df.schema == size_df.schema
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC If we can get the two schemas to match with a simple **`select`** statement, then we can use **`union`**.
+
+# COMMAND ----------
+
+union_count = mattress_df.select("email").union(size_df.select("email")).count()
+
+mattress_count = mattress_df.count()
+size_count = size_df.count()
+
+mattress_count + size_count == union_count
+
+# COMMAND ----------
+
 # MAGIC %md ### Clean up classroom
 # MAGIC
 # MAGIC And lastly, we'll clean up the classroom.

Apache-Spark-Programming-with-Databricks/Solutions/ASP 3 - Functions/ASP 3.3L - Users.py

Lines changed: 0 additions & 2 deletions
@@ -99,8 +99,6 @@
 # MAGIC - Drop the **`details`** column
 # MAGIC
 # MAGIC Save the result as **`union_df`**.
-# MAGIC
-# MAGIC <img src="https://files.training.databricks.com/images/icon_warn_32.png" alt="Warning"> The DataFrame <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.union.html" target="_blank">**`union`**</a> method resolves columns by position, as in standard SQL. You should use it only if the two DataFrames have exactly the same schema, including the column order. In contrast, the DataFrame <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.unionByName.html" target="_blank">**`unionByName`**</a> method resolves columns by name.

 # COMMAND ----------

Apache-Spark-Programming-with-Databricks/Solutions/ASP 3 - Functions/ASP 3.4 - Additional Functions.py

Lines changed: 68 additions & 20 deletions
@@ -16,7 +16,8 @@
 # MAGIC 1. Join DataFrames
 # MAGIC
 # MAGIC ##### Methods
-# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameNaFunctions.html" target="_blank">DataFrameNaFunctions</a>: **`fill`**
+# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.join.html?highlight=dataframe%20join#pyspark.sql.DataFrame.join" target="_blank">DataFrame Methods</a>: **`join`**
+# MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameNaFunctions.html" target="_blank">DataFrameNaFunctions</a>: **`fill`**, **`drop`**
 # MAGIC - <a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html?#functions" target="_blank">Built-In Functions</a>:
 # MAGIC - Aggregate: **`collect_set`**
 # MAGIC - Collection: **`explode`**
@@ -32,51 +33,79 @@

 # COMMAND ----------

-# MAGIC %md ### DataFrameNaFunctions
-# MAGIC <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameNaFunctions.html" target="_blank">DataFrameNaFunctions</a> is a DataFrame submodule with methods for handling null values. Obtain an instance of DataFrameNaFunctions by accessing the **`na`** attribute of a DataFrame.
+sales_df = spark.read.format("delta").load(sales_path)
+display(sales_df)
+
+# COMMAND ----------
+
+# MAGIC %md ### Non-aggregate and Miscellaneous Functions
+# MAGIC Here are a few additional non-aggregate and miscellaneous built-in functions.
 # MAGIC
 # MAGIC | Method | Description |
 # MAGIC | --- | --- |
-# MAGIC | drop | Returns a new DataFrame omitting rows with any, all, or a specified number of null values, considering an optional subset of columns |
-# MAGIC | fill | Replace null values with the specified value for an optional subset of columns |
-# MAGIC | replace | Returns a new DataFrame replacing a value with another value, considering an optional subset of columns |
+# MAGIC | col / column | Returns a Column based on the given column name. |
+# MAGIC | lit | Creates a Column of literal value |
+# MAGIC | isnull | Return true iff the column is null |
+# MAGIC | rand | Generate a random column with independent and identically distributed (i.i.d.) samples uniformly distributed in [0.0, 1.0) |

 # COMMAND ----------

-sales_df = spark.read.format("delta").load(sales_path)
-display(sales_df)
+# MAGIC %md We could select a particular column using the **`col`** function.

 # COMMAND ----------

-# MAGIC %md Let's say we need to remove the email addresses from our dataset.
+gmail_accounts = sales_df.filter(col("email").contains("gmail"))
+
+display(gmail_accounts)

 # COMMAND ----------

-no_pii_df = sales_df.drop("email")
+# MAGIC %md **`lit`** can be used to create a column out of a value, which is useful for appending columns.
+
+# COMMAND ----------

-display(no_pii_df)
+display(gmail_accounts.select("email", lit(True).alias("gmail user")))

 # COMMAND ----------

-# MAGIC %md ### Non-aggregate and Miscellaneous Functions
-# MAGIC Here are a few additional non-aggregate and miscellaneous built-in functions.
+# MAGIC %md ### DataFrameNaFunctions
+# MAGIC <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameNaFunctions.html" target="_blank">DataFrameNaFunctions</a> is a DataFrame submodule with methods for handling null values. Obtain an instance of DataFrameNaFunctions by accessing the **`na`** attribute of a DataFrame.
 # MAGIC
 # MAGIC | Method | Description |
 # MAGIC | --- | --- |
-# MAGIC | col / column | Returns a Column based on the given column name. |
-# MAGIC | lit | Creates a Column of literal value |
-# MAGIC | isnull | Return true iff the column is null |
-# MAGIC | rand | Generate a random column with independent and identically distributed (i.i.d.) samples uniformly distributed in [0.0, 1.0) |
+# MAGIC | drop | Returns a new DataFrame omitting rows with any, all, or a specified number of null values, considering an optional subset of columns |
+# MAGIC | fill | Replace null values with the specified value for an optional subset of columns |
+# MAGIC | replace | Returns a new DataFrame replacing a value with another value, considering an optional subset of columns |

 # COMMAND ----------

-# MAGIC %md We could select a particular column using the **`col`** function
+# MAGIC %md
+# MAGIC Here we'll see the row count before and after dropping rows with null/NA values.

 # COMMAND ----------

-gmail_accounts = sales_df.filter(col("email").contains("gmail"))
+print(sales_df.count())
+print(sales_df.na.drop().count())

-display(gmail_accounts)
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC Since the row counts are the same, no rows contain null values. We'll need to explode **`items`** to find nulls in nested columns such as **`items.coupon`**.
+
+# COMMAND ----------
+
+sales_exploded_df = sales_df.withColumn("items", explode(col("items")))
+display(sales_exploded_df.select("items.coupon"))
+print(sales_exploded_df.select("items.coupon").count())
+print(sales_exploded_df.select("items.coupon").na.drop().count())
+
+# COMMAND ----------
+
+# MAGIC %md We can fill in the missing coupon codes with **`na.fill`**.
+
+# COMMAND ----------
+
+display(sales_exploded_df.select("items.coupon").na.fill("NO COUPON"))

 # COMMAND ----------

@@ -99,6 +128,25 @@

 # COMMAND ----------

+# MAGIC %md
+# MAGIC We'll load in our users data to join with our **`gmail_accounts`** from above.
+
+# COMMAND ----------
+
+users_df = spark.read.format("delta").load(users_path)
+display(users_df)
+
+# COMMAND ----------
+
+joined_df = gmail_accounts.join(other=users_df, on="email", how="inner")
+display(joined_df)
+
+# COMMAND ----------
+
+classroom_cleanup()
+
+# COMMAND ----------
+
 # MAGIC %md-sandbox
 # MAGIC &copy; 2022 Databricks, Inc. All rights reserved.<br/>
 # MAGIC Apache, Apache Spark, Spark and the Spark logo are trademarks of the <a href="https://www.apache.org/">Apache Software Foundation</a>.<br/>
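To close out, here is the filter-tag-join pattern from this diff condensed into one runnable sketch on invented rows (the sales/users shapes are assumptions for illustration, not the course datasets; assumes a live spark session):

from pyspark.sql.functions import col, lit

sales = spark.createDataFrame(
    [("a@gmail.com", 100.0), ("b@yahoo.com", 50.0)], ["email", "revenue"])
users = spark.createDataFrame(
    [("a@gmail.com", "Ada"), ("b@yahoo.com", "Bo")], ["email", "name"])

# Keep gmail addresses, tag them with a literal column, then inner-join
# to users on the shared email key.
gmail = sales.filter(col("email").contains("gmail"))
(gmail.select("email", lit(True).alias("gmail user"))
      .join(users, on="email", how="inner")
      .show())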
