p5149247263
diff --git a/‎model_analysis/feature_importance_tutorial.ipynb‎
Lines changed: 168 additions & 0 deletions b/‎model_analysis/feature_importance_tutorial.ipynb‎
Lines changed: 168 additions & 0 deletions
@@ -0,0 +1,168 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# $$CatBoost\\ Feature\\ Importance\\ Tutorial$$"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### It is often important to understand which features contribute the most to a model's result. For this purpose, CatBoost models provide the `get_feature_importance` method."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "from catboost import CatBoost, Pool, datasets\n",
+ "from sklearn.model_selection import train_test_split"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": true
+ },
+ "source": [
+ "#### First, let's prepare the dataset:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Keep only the first element returned by datasets.higgs() (the training frame).\n",
+ "# Indexing instead of unpacking into `_` leaves IPython's `_` (last output) untouched.\n",
+ "train_df = datasets.higgs()[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Slice the leading rows BEFORE converting to NumPy, so the full frame is never copied into an array.\n",
+ "N_SAMPLES = 1000  # number of leading rows used in this tutorial\n",
+ "sample_df = train_df.iloc[:N_SAMPLES]\n",
+ "\n",
+ "# Column 0 is used as the label; every other column is a feature.\n",
+ "X, y = np.array(sample_df.drop(0, axis=1)), np.array(sample_df[0])\n",
+ "\n",
+ "# A fixed random_state keeps the 75/25 train/test split reproducible.\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)\n",
+ "train_pool = Pool(X_train, y_train)\n",
+ "test_pool = Pool(X_test, y_test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Let's train CatBoost:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# Training parameters: 20 iterations, silent logging, fixed seed, Lossguide grow policy.\n",
+ "cb_params = {\n",
+ "    'iterations': 20,\n",
+ "    'verbose': False,\n",
+ "    'random_seed': 42,\n",
+ "    'grow_policy': 'Lossguide',\n",
+ "}\n",
+ "cb = CatBoost(cb_params)\n",
+ "cb.fit(train_pool);"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### CatBoost provides several types of feature importance. One of them is `PredictionDiff`: a vector with the contributions of each feature to the `RawFormulaVal` difference for each pair of objects."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Let's find two objects in the test data whose labels were predicted incorrectly:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Predicted class = index of the largest value along axis 1 of the 'Probability' output.\n",
+ "probabilities = cb.predict(X_test, prediction_type='Probability')\n",
+ "prediction = np.argmax(probabilities, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Positions in the test set where the predicted class differs from the true label.\n",
+ "wrong_prediction_idxs = np.flatnonzero(y_test != prediction)\n",
+ "\n",
+ "# Keep the first two such objects as the pair to compare.\n",
+ "first_two_wrong = wrong_prediction_idxs[:2]\n",
+ "test_pool_slice = test_pool.slice(first_two_wrong)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Let's calculate PredictionDiff for these two objects:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "22: 0.590958854452\n",
+ "25: 0.706977071538\n"
+ ]
+ }
+ ],
+ "source": [
+ "prediction_diff = cb.get_feature_importance(data=test_pool_slice, type='PredictionDiff')\n",
+ "\n",
+ "# Collect (feature index, value) for every strictly positive entry, then print them in order.\n",
+ "positive_diffs = [\n",
+ "    (position[0], value)\n",
+ "    for position, value in np.ndenumerate(prediction_diff)\n",
+ "    if value > 0.\n",
+ "]\n",
+ "for feature_id, diff in positive_diffs:\n",
+ "    print('{}: {}'.format(feature_id, diff))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### As you can see, feature 25 makes the largest contribution to the difference between the predictions for these two objects."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.17"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}