|
| 1 | +# Databricks notebook source |
| 2 | +# MAGIC |
| 3 | +# MAGIC %md-sandbox |
| 4 | +# MAGIC <div style="text-align: center; line-height: 0; padding-top: 9px;"> |
| 5 | +# MAGIC <img src="https://databricks.com/wp-content/uploads/2018/03/db-academy-rgb-1200px.png" alt="Databricks Learning" style="width: 400px"> |
| 6 | +# MAGIC </div> |
| 7 | + |
| 8 | +# COMMAND ---------- |
| 9 | + |
| 10 | +# MAGIC %md |
| 11 | +# MAGIC # Spark SQL |
| 12 | +# MAGIC 1. Run a SQL query |
| 13 | +# MAGIC 1. Create DataFrame from table |
| 14 | +# MAGIC 1. Write same query using DataFrame transformations |
| 15 | +# MAGIC 1. Trigger computation with DataFrame actions |
| 16 | +# MAGIC 1. Convert between DataFrames and SQL |
| 17 | +# MAGIC |
| 18 | +# MAGIC ##### Methods |
| 19 | +# MAGIC - SparkSession (<a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#spark-session-apis" target="_blank">Python</a>/<a href="http://spark.apache.org/docs/latest/api/scala/org/apache/spark/sql/SparkSession.html" target="_blank">Scala</a>): `sql`, `table` |
| 20 | +# MAGIC - DataFrame (<a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" target="_blank">Python</a>/<a href="http://spark.apache.org/docs/latest/api/scala/org/apache/spark/sql/Dataset.html" target="_blank">Scala</a>): |
| 21 | +# MAGIC - Transformations: `select`, `where`, `orderBy` |
| 22 | +# MAGIC - Actions: `show`, `count`, `take` |
| 23 | +# MAGIC - Other methods: `printSchema`, `schema`, `createOrReplaceTempView` |
| 24 | + |
| 25 | +# COMMAND ---------- |
| 26 | + |
| 27 | +# MAGIC %run ./Includes/Classroom-Setup-SQL |
| 28 | + |
| 29 | +# COMMAND ---------- |
| 30 | + |
| 31 | +# MAGIC %md |
| 32 | +# MAGIC ###  Run a SQL query |
| 33 | +# MAGIC Use `SparkSession` to run SQL |
| 34 | + |
| 35 | +# COMMAND ---------- |
| 36 | + |
| 37 | +budgetDF = spark.sql(""" |
| 38 | +SELECT name, price |
| 39 | +FROM products |
| 40 | +WHERE price < 200 |
| 41 | +ORDER BY price |
| 42 | +""") |
| 43 | + |
| 44 | +# COMMAND ---------- |
| 45 | + |
| 46 | +# MAGIC %md |
| 47 | +# MAGIC View results in the returned DataFrame |
| 48 | + |
| 49 | +# COMMAND ---------- |
| 50 | + |
| 51 | +budgetDF.show() |
| 52 | + |
| 53 | +# COMMAND ---------- |
| 54 | + |
| 55 | +display(budgetDF) |
| 56 | + |
| 57 | +# COMMAND ---------- |
| 58 | + |
| 59 | +# MAGIC %md |
| 60 | +# MAGIC ###  Create a DataFrame |
| 61 | +# MAGIC Use `SparkSession` to create a DataFrame from a table |
| 62 | + |
| 63 | +# COMMAND ---------- |
| 64 | + |
| 65 | +productsDF = spark.table("products") |
| 66 | +display(productsDF) |
| 67 | + |
| 68 | +# COMMAND ---------- |
| 69 | + |
| 70 | +# MAGIC %md |
| 71 | +# MAGIC Access schema of DataFrame |
| 72 | + |
| 73 | +# COMMAND ---------- |
| 74 | + |
| 75 | +productsDF.printSchema() |
| 76 | + |
| 77 | +# COMMAND ---------- |
| 78 | + |
| 79 | +productsDF.schema |
| 80 | + |
| 81 | +# COMMAND ---------- |
| 82 | + |
| 83 | +# MAGIC %md |
| 84 | +# MAGIC ###  Write same query with DataFrame transformations |
| 85 | + |
| 86 | +# COMMAND ---------- |
| 87 | + |
| 88 | +budgetDF = (productsDF |
| 89 | + .select("name", "price") |
| 90 | + .where("price < 200") |
| 91 | + .orderBy("price") |
| 92 | +) |
| 93 | + |
| 94 | +# COMMAND ---------- |
| 95 | + |
| 96 | +# MAGIC %md |
| 97 | +# MAGIC ###  Trigger computation with DataFrame actions |
| 98 | + |
| 99 | +# COMMAND ---------- |
| 100 | + |
| 101 | +budgetDF.count() |
| 102 | + |
| 103 | +# COMMAND ---------- |
| 104 | + |
| 105 | +budgetDF.take(2) |
| 106 | + |
| 107 | +# COMMAND ---------- |
| 108 | + |
| 109 | +# MAGIC %md |
| 110 | +# MAGIC ###  Convert between DataFrames and SQL |
| 111 | + |
| 112 | +# COMMAND ---------- |
| 113 | + |
| 114 | +budgetDF.createOrReplaceTempView("budget") |
| 115 | + |
| 116 | +# COMMAND ---------- |
| 117 | + |
| 118 | +display(spark.sql("SELECT * FROM budget")) |
| 119 | + |
| 120 | +# COMMAND ---------- |
| 121 | + |
| 122 | +# MAGIC %md |
| 123 | +# MAGIC ##  Spark SQL Lab |
| 124 | +# MAGIC |
| 125 | +# MAGIC 1. Create a DataFrame from the `events` table |
| 126 | +# MAGIC 1. Display DataFrame and inspect schema |
| 127 | +# MAGIC 1. Apply transformations to filter and sort `macOS` events |
| 128 | +# MAGIC 1. Count results and take first 5 rows |
| 129 | +# MAGIC 1. Create the same DataFrame using SQL query |
| 130 | +# MAGIC |
| 131 | +# MAGIC ##### Methods |
| 132 | +# MAGIC - <a href="http://spark.apache.org/docs/latest/api/scala/org/apache/spark/sql/SparkSession.html" target="_blank">SparkSession</a>: `sql`, `table` |
| 133 | +# MAGIC - <a href="http://spark.apache.org/docs/latest/api/scala/org/apache/spark/sql/Dataset.html" target="_blank">DataFrame</a> transformations: `select`, `where`, `orderBy` |
| 134 | +# MAGIC - <a href="http://spark.apache.org/docs/latest/api/scala/org/apache/spark/sql/Dataset.html" target="_blank">DataFrame</a> actions: `show`, `count`, `take` |
| 135 | +# MAGIC - Other <a href="http://spark.apache.org/docs/latest/api/scala/org/apache/spark/sql/Dataset.html" target="_blank">DataFrame</a> methods: `printSchema`, `schema`, `createOrReplaceTempView` |
| 136 | + |
| 137 | +# COMMAND ---------- |
| 138 | + |
| 139 | +# MAGIC %md |
| 140 | +# MAGIC ### 1. Create a DataFrame from the `events` table |
| 141 | +# MAGIC - Use SparkSession to create a DataFrame from the `events` table |
| 142 | + |
| 143 | +# COMMAND ---------- |
| 144 | + |
| 145 | +# TODO |
| 146 | +eventsDF = FILL_IN |
| 147 | + |
| 148 | +# COMMAND ---------- |
| 149 | + |
| 150 | +# MAGIC %md |
| 151 | +# MAGIC ### 2. Display DataFrame and inspect schema |
| 152 | +# MAGIC - Use methods above to inspect DataFrame contents and schema |
| 153 | + |
| 154 | +# COMMAND ---------- |
| 155 | + |
| 156 | +# TODO |
| 157 | + |
| 158 | +# COMMAND ---------- |
| 159 | + |
| 160 | +# TODO |
| 161 | + |
| 162 | +# COMMAND ---------- |
| 163 | + |
| 164 | +# MAGIC %md-sandbox |
| 165 | +# MAGIC ### 3. Apply transformations to filter and sort `macOS` events |
| 166 | +# MAGIC - Filter for rows where `device` is `macOS` |
| 167 | +# MAGIC - Sort rows by `event_timestamp` |
| 168 | +# MAGIC |
| 169 | +# MAGIC <img alt="Hint" title="Hint" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.3em" src="https://files.training.databricks.com/static/images/icon-light-bulb.svg"/> **Hint:** Your filter SQL expression needs both kinds of quotes: wrap the whole expression in double quotes and put the string value inside it in single quotes |
| 170 | + |
| 171 | +# COMMAND ---------- |
| 172 | + |
| 173 | +# TODO |
| 174 | +macDF = (eventsDF |
| 175 | + .FILL_IN |
| 176 | +) |
| 177 | + |
| 178 | +# COMMAND ---------- |
| 179 | + |
| 180 | +# MAGIC %md |
| 181 | +# MAGIC ### 4. Count results and take first 5 rows |
| 182 | +# MAGIC - Use DataFrame actions to count and take rows |
| 183 | + |
| 184 | +# COMMAND ---------- |
| 185 | + |
| 186 | +# TODO |
| 187 | +numRows = macDF.FILL_IN |
| 188 | +rows = macDF.FILL_IN |
| 189 | + |
| 190 | +# COMMAND ---------- |
| 191 | + |
| 192 | +# MAGIC %md-sandbox |
| 193 | +# MAGIC ##### <img alt="Best Practice" title="Best Practice" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.3em" src="https://files.training.databricks.com/static/images/icon-blue-ribbon.svg"/> Check your work |
| 194 | + |
| 195 | +# COMMAND ---------- |
| 196 | + |
| 197 | +from pyspark.sql import Row |
| 198 | + |
| 199 | +assert(numRows == 1938215) |
| 200 | +assert(len(rows) == 5) |
| 201 | +assert(type(rows[0]) == Row) |
| 202 | + |
| 203 | +# COMMAND ---------- |
| 204 | + |
| 205 | +# MAGIC %md |
| 206 | +# MAGIC ### 5. Create the same DataFrame using SQL query |
| 207 | +# MAGIC - Use SparkSession to run a SQL query on the `events` table |
| 208 | +# MAGIC - Use SQL commands above to write the same filter and sort query used earlier |
| 209 | + |
| 210 | +# COMMAND ---------- |
| 211 | + |
| 212 | +# TODO |
| 213 | +macSQLDF = spark.FILL_IN |
| 214 | + |
| 215 | +display(macSQLDF) |
| 216 | + |
| 217 | +# COMMAND ---------- |
| 218 | + |
| 219 | +# MAGIC %md-sandbox |
| 220 | +# MAGIC ##### <img alt="Best Practice" title="Best Practice" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.3em" src="https://files.training.databricks.com/static/images/icon-blue-ribbon.svg"/> Check your work |
| 221 | +# MAGIC - You should only see `macOS` values in the `device` column |
| 222 | +# MAGIC - The fifth row should be an event with timestamp `1592539226602157` |
| 223 | + |
| 224 | +# COMMAND ---------- |
| 225 | + |
| 226 | +# MAGIC %md |
| 227 | +# MAGIC ### Classroom Cleanup |
| 228 | + |
| 229 | +# COMMAND ---------- |
| 230 | + |
| 231 | +# MAGIC %run ./Includes/Classroom-Cleanup |
0 commit comments