Skip to content

Commit b6974bd

Browse files
Merge pull request #3 from HowardRiddiough/update-notebook
Improved 'spark_predict_pudf'
2 parents be0ce52 + 30ddbd3 commit b6974bd

File tree

1 file changed

+10
-8
lines changed

1 file changed

+10
-8
lines changed

deploying-python-ml-in-pyspark.ipynb

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@
2828
"import pandas as pd\n",
2929
"from sklearn.ensemble import RandomForestRegressor\n",
3030
"from sklearn.pipeline import Pipeline\n",
31-
"from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder\n",
31+
"from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder\n",
3232
"from sklearn.compose import ColumnTransformer\n",
3333
"import pyspark.sql\n",
3434
"from pyspark.sql import SparkSession\n",
@@ -136,21 +136,23 @@
136136
"metadata": {},
137137
"outputs": [],
138138
"source": [
139-
"def spark_predict(model, *cols) -> pyspark.sql.column:\n",
139+
"def spark_predict(model, cols) -> pyspark.sql.column:\n",
140140
" \"\"\"This function deploys python ml in PySpark using the `predict` method of the `model` parameter.\n",
141141
" \n",
142142
" Args:\n",
143143
" model: python ml model with sklearn API\n",
144-
" *cols (list-like): Features used for predictions, required to be present as columns in the spark \n",
144+
" cols (list-like): Features used for predictions, required to be present as columns in the spark \n",
145145
" DataFrame used to make predictions.\n",
146146
" \"\"\"\n",
147147
" @sf.pandas_udf(returnType=DoubleType())\n",
148148
" def predict_pandas_udf(*cols):\n",
149-
" # cols will be a tuple of pandas.Series here.\n",
150149
" X = pd.concat(cols, axis=1)\n",
151150
" return pd.Series(model.predict(X))\n",
152151
" \n",
153-
" return predict_pandas_udf(*cols)"
152+
" return predict_pandas_udf(*cols)\n",
153+
"\n",
154+
" \n",
155+
" "
154156
]
155157
},
156158
{
@@ -184,7 +186,7 @@
184186
"(\n",
185187
" ddf\n",
186188
" .select(NUMERICAL_FEATURES + [TARGET])\n",
187-
" .withColumn(\"prediction\", spark_predict(rf, *NUMERICAL_FEATURES).alias(\"prediction\"))\n",
189+
" .withColumn(\"prediction\", spark_predict(rf, NUMERICAL_FEATURES).alias(\"prediction\"))\n",
188190
" .show(5)\n",
189191
")"
190192
]
@@ -230,7 +232,7 @@
230232
"(\n",
231233
" ddf\n",
232234
" .select(NUMERICAL_FEATURES + [TARGET])\n",
233-
" .withColumn(\"pipe_predict\", spark_predict(pipe, *NUMERICAL_FEATURES).alias(\"prediction\")).show(5)\n",
235+
" .withColumn(\"pipe_predict\", spark_predict(pipe, NUMERICAL_FEATURES).alias(\"prediction\")).show(5)\n",
234236
")"
235237
]
236238
},
@@ -285,7 +287,7 @@
285287
"(\n",
286288
" ddf\n",
287289
" .select(ALL_FEATURES + [TARGET])\n",
288-
" .withColumn(\"pipe_predict\", spark_predict(preprocessor_pipe, *ALL_FEATURES).alias(\"prediction\"))\n",
290+
" .withColumn(\"pipe_predict\", spark_predict(preprocessor_pipe, ALL_FEATURES).alias(\"prediction\"))\n",
289291
" .show(5)\n",
290292
")"
291293
]

0 commit comments

Comments (0)