{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b332354c",
   "metadata": {
    "id": "b332354c"
   },
   "source": [
    "### Here is an example notebook on Boosted Decision Trees"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "67e6d520",
   "metadata": {
    "id": "67e6d520"
   },
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "4a48cdef",
   "metadata": {
    "id": "4a48cdef"
   },
   "source": [
    "So let us start by importing some packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6867b53",
   "metadata": {
    "executionInfo": {
     "elapsed": 531,
     "status": "ok",
     "timestamp": 1707468707750,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "b6867b53"
   },
   "outputs": [],
   "source": [
    "#%matplotlib inline\n",
    "\n",
    "import random\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib as mpl\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import matplotlib.patches as mpatches\n",
    "\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "from matplotlib.colors import ListedColormap\n",
    "from matplotlib import cm\n",
    "from mpl_toolkits.mplot3d.axes3d import get_test_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "65a73fb5",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 256
    },
    "executionInfo": {
     "elapsed": 2310,
     "status": "ok",
     "timestamp": 1707468711944,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "65a73fb5",
    "outputId": "3ae5f044-be37-4efb-8aaa-2bc4ab10304a"
   },
   "outputs": [],
   "source": [
    "# Read the training table from the zip-compressed CSV (relative path)\n",
    "df_train = pd.read_csv(\"training.zip\", compression='zip')\n",
    "\n",
    "# Preview the first rows\n",
    "df_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f77b4b8c",
   "metadata": {
    "executionInfo": {
     "elapsed": 661,
     "status": "ok",
     "timestamp": 1707468714046,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "f77b4b8c"
   },
   "outputs": [],
   "source": [
    "# Split the events by their label column: 's' rows and 'b' rows\n",
    "is_signal = df_train[\"Label\"] == 's'\n",
    "is_background = df_train[\"Label\"] == 'b'\n",
    "df_train_signal = df_train.loc[is_signal]\n",
    "df_train_background = df_train.loc[is_background]\n",
    "# verify with the head option"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76d9ee0e",
   "metadata": {
    "executionInfo": {
     "elapsed": 381,
     "status": "ok",
     "timestamp": 1707468717957,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "76d9ee0e"
   },
   "outputs": [],
   "source": [
    "# Pull one column out of each class as a plain NumPy array for plotting\n",
    "mt_column = \"DER_mass_transverse_met_lep\"\n",
    "s_DER_mass_transverse_met_lep = df_train_signal[mt_column].to_numpy()\n",
    "b_DER_mass_transverse_met_lep = df_train_background[mt_column].to_numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03632d5e",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 483
    },
    "executionInfo": {
     "elapsed": 7,
     "status": "ok",
     "timestamp": 1707468720351,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "03632d5e",
    "outputId": "d752bb17-cd7d-4a93-d81d-8cdc9cc99011"
   },
   "outputs": [],
   "source": [
    "# Overlay the signal and background distributions of the same variable\n",
    "mt_samples = [s_DER_mass_transverse_met_lep, b_DER_mass_transverse_met_lep]\n",
    "plt.hist(\n",
    "    mt_samples,\n",
    "    bins=50, range=[0,200], alpha=0.5, label=['sig', 'bkg'],\n",
    ")\n",
    "plt.xlabel('DER_mass_transverse_met_lep', labelpad=15)\n",
    "plt.legend(loc='upper right')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "479d6b0a",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 483
    },
    "executionInfo": {
     "elapsed": 1211,
     "status": "ok",
     "timestamp": 1707468725043,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "479d6b0a",
    "outputId": "48575ad2-d422-4aec-ca41-bd6aa2704828"
   },
   "outputs": [],
   "source": [
    "# Same signal-vs-background comparison for the DER_pt_h column\n",
    "s_DER_pt_h = df_train_signal[\"DER_pt_h\"].to_numpy()\n",
    "b_DER_pt_h = df_train_background[\"DER_pt_h\"].to_numpy()\n",
    "pt_samples = [s_DER_pt_h, b_DER_pt_h]\n",
    "plt.hist(\n",
    "    pt_samples,\n",
    "    bins=50, range=[0,200], alpha=0.5, label=['sig', 'bkg'],\n",
    ")\n",
    "plt.xlabel('DER_pt_h', labelpad=15)\n",
    "plt.legend(loc='upper right')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7052c8cf",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 535,
     "status": "ok",
     "timestamp": 1707468728722,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "7052c8cf",
    "outputId": "0df4ad8f-7876-452c-dcee-35d5965945b7"
   },
   "outputs": [],
   "source": [
    "# Encode the string labels as integers: 's' -> 1, 'b' -> 0.\n",
    "# DataFrame.replace relied on implicit object->int downcasting, which is\n",
    "# deprecated since pandas 2.2; an explicit map + astype(int) always yields an\n",
    "# integer column. The extra 1/0 keys keep the cell safe to re-run on a frame\n",
    "# that is already encoded; any other label value fails loudly in astype(int).\n",
    "label_codes = {'s': 1, 'b': 0, 1: 1, 0: 0}\n",
    "df_train = df_train.assign(Label=df_train['Label'].map(label_codes).astype(int))\n",
    "# Inputs: every column whose name contains 'DER'\n",
    "X_train = np.array(df_train.filter(like='DER'))\n",
    "Y_train = np.array(df_train['Label'])\n",
    "Weights = np.array(df_train['Weight'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "jXiFOBfecyzA",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 455,
     "status": "ok",
     "timestamp": 1707468733461,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "jXiFOBfecyzA",
    "outputId": "6f69934a-97fe-4f82-8906-7c39068e9243"
   },
   "outputs": [],
   "source": [
    "# Show the event weights as taken from the 'Weight' column\n",
    "Weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "jetOMHUncUvF",
   "metadata": {
    "executionInfo": {
     "elapsed": 496,
     "status": "ok",
     "timestamp": 1707468737101,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "jetOMHUncUvF"
   },
   "outputs": [],
   "source": [
    "# Rescale the weights in place so that each class (label 0, then label 1) sums to 200000\n",
    "for label_value in (0, 1):\n",
    "    class_mask = Y_train == label_value\n",
    "    Weights[class_mask] *= 200000/Weights[class_mask].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "V4SediH6c07P",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 2,
     "status": "ok",
     "timestamp": 1707468738825,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "V4SediH6c07P",
    "outputId": "e4502f87-b5c2-42b1-c1de-915f77b60e4c"
   },
   "outputs": [],
   "source": [
    "# Show the weights again after the per-class rescaling\n",
    "Weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0bf9cf78",
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sn # seaborn for nice plot quicker\n",
    "\n",
    "# EventId and Weight are not input features, and Label is constant inside each\n",
    "# class (its correlation is undefined), so they are left out of the matrices.\n",
    "# errors='ignore' keeps the cell working if one of them is absent.\n",
    "non_feature_columns = ['EventId', 'Label', 'Weight']\n",
    "class_selections = (\n",
    "    (\"Signal feature correlation matrix\", df_train.Label > 0.5),\n",
    "    (\"Background feature correlation matrix\", df_train.Label < 0.5),\n",
    ")\n",
    "for heading, selection in class_selections:\n",
    "    print (heading)\n",
    "    corrMatrix = df_train[selection].drop(columns=non_feature_columns, errors='ignore').corr()\n",
    "    # One explicit, large figure per class so the annotated cells stay legible\n",
    "    corr_fig, corr_ax = plt.subplots(figsize=(20, 16))\n",
    "    sn.heatmap(corrMatrix, annot=True, ax=corr_ax)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ba4bd9c4",
   "metadata": {
    "id": "ba4bd9c4"
   },
   "source": [
    "Now let's randomly split our inputs, labels and event weights into training and testing data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a4cbaad",
   "metadata": {
    "executionInfo": {
     "elapsed": 526,
     "status": "ok",
     "timestamp": 1707468741825,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "9a4cbaad"
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "304ac494",
   "metadata": {
    "executionInfo": {
     "elapsed": 381,
     "status": "ok",
     "timestamp": 1707468744822,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "304ac494"
   },
   "outputs": [],
   "source": [
    "# Hold out 20% of the events for testing; features, labels and weights are\n",
    "# split together, with a fixed random_state so the partition is repeatable\n",
    "split_arrays = train_test_split(X_train, Y_train, Weights, test_size=0.20, random_state=45)\n",
    "x_train, x_test, y_train, y_test, w_train, w_test = split_arrays"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f68a371d",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 414,
     "status": "ok",
     "timestamp": 1707468746809,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "f68a371d",
    "outputId": "be40b70e-e8a4-44c6-ec90-cf7401b358e7"
   },
   "outputs": [],
   "source": [
    "# Number of rows in the training feature array\n",
    "len(x_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "J3t0Ua-ISrIB",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 4,
     "status": "ok",
     "timestamp": 1707468748793,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "J3t0Ua-ISrIB",
    "outputId": "6b397cd6-0d13-41fb-b602-487b6a5a5a64"
   },
   "outputs": [],
   "source": [
    "# Number of training labels\n",
    "len(y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "iy_jgwTZStP7",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 388,
     "status": "ok",
     "timestamp": 1707468751304,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "iy_jgwTZStP7",
    "outputId": "86cd8618-c111-43fd-dad9-b197e0b3a5d1"
   },
   "outputs": [],
   "source": [
    "# Number of training weights\n",
    "len(w_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8ea38ec0",
   "metadata": {},
   "source": [
    "We will try two BDT algorithms: one from scikit-learn and the other from XGBoost."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8607126c",
   "metadata": {
    "executionInfo": {
     "elapsed": 690,
     "status": "ok",
     "timestamp": 1707468754413,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "8607126c"
   },
   "outputs": [],
   "source": [
    "# BDT from SKLearn\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "gbc_settings = dict(n_estimators=100, max_depth=3, learning_rate=0.1, min_samples_split=10, min_samples_leaf=2)\n",
    "model_gbc = GradientBoostingClassifier(**gbc_settings)\n",
    "# Train with the per-event weights; the result of fit() is bound back to model_gbc\n",
    "model_gbc = model_gbc.fit(x_train, y_train, sample_weight=w_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b1ed8d9",
   "metadata": {
    "executionInfo": {
     "elapsed": 617,
     "status": "ok",
     "timestamp": 1707468757877,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "8b1ed8d9"
   },
   "outputs": [],
   "source": [
    "print ('from test sample')\n",
    "# Class probabilities for the held-out events; column 1 is the one fed to roc_curve later\n",
    "test_scores_gbc = model_gbc.predict_proba(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a178c74d",
   "metadata": {
    "executionInfo": {
     "elapsed": 2,
     "status": "ok",
     "timestamp": 1707468760176,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "a178c74d"
   },
   "outputs": [],
   "source": [
    "# Class probabilities for the events the model was trained on\n",
    "train_scores_gbc = model_gbc.predict_proba(x_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6230066",
   "metadata": {},
   "outputs": [],
   "source": [
    "# make ROC curve\n",
    "from sklearn.metrics import roc_curve,auc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1121e540",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Weighted ROC points for the sklearn BDT, scored with the column-1 probability\n",
    "fpr_tr_gbc, tpr_tr_gbc, tresholds_tr_gbc = roc_curve(\n",
    "    y_train, train_scores_gbc[:, 1], sample_weight=w_train)\n",
    "fpr_te_gbc, tpr_te_gbc, tresholds_te_gbc = roc_curve(\n",
    "    y_test, test_scores_gbc[:, 1], sample_weight=w_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0c960b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Area under the test-sample ROC curve of the sklearn BDT\n",
    "test_auc_gbc = auc(fpr_te_gbc, tpr_te_gbc)\n",
    "print(\"BDT test set auc - {}\".format(test_auc_gbc))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc0c3097",
   "metadata": {
    "executionInfo": {
     "elapsed": 400,
     "status": "ok",
     "timestamp": 1707468762793,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "bc0c3097"
   },
   "outputs": [],
   "source": [
    "# Now try BDT from XGBoost\n",
    "import xgboost as xgb\n",
    "xgb_settings = dict(learning_rate=0.1, max_depth=3, min_child_weight=10, n_estimators=100,\n",
    "                    objective='binary:logistic', verbosity=1)\n",
    "cls = xgb.XGBClassifier(**xgb_settings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6aee8107",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 670,
     "status": "ok",
     "timestamp": 1707468765714,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "6aee8107",
    "outputId": "561a0578-c102-4867-ca0c-4db6d22f154a"
   },
   "outputs": [],
   "source": [
    "# sample_weight is keyword-only in XGBoost's scikit-learn API; passing it\n",
    "# positionally is deprecated, so name it explicitly.\n",
    "cls.fit(x_train, y_train, sample_weight=w_train) #training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90c45cd4",
   "metadata": {
    "executionInfo": {
     "elapsed": 3,
     "status": "ok",
     "timestamp": 1707468768660,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "90c45cd4"
   },
   "outputs": [],
   "source": [
    "# Announce each sample right before it is scored: the original printed only\n",
    "# 'from train sample', and did so ahead of the test-sample prediction.\n",
    "print ('from test sample')\n",
    "test_scores = cls.predict_proba(x_test)\n",
    "\n",
    "print ('from train sample')\n",
    "train_scores = cls.predict_proba(x_train)\n",
    "#len(train_scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d152dfd6",
   "metadata": {
    "executionInfo": {
     "elapsed": 6,
     "status": "ok",
     "timestamp": 1707468771185,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "d152dfd6"
   },
   "outputs": [],
   "source": [
    "# Get ROC\n",
    "# Weighted ROC points for the XGBoost model, scored with the column-1 probability\n",
    "fpr_tr, tpr_tr, tresholds_tr = roc_curve(\n",
    "    y_train, train_scores[:, 1], sample_weight=w_train)\n",
    "fpr_te, tpr_te, tresholds_te = roc_curve(\n",
    "    y_test, test_scores[:, 1], sample_weight=w_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ebf05a4e",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 887,
     "status": "ok",
     "timestamp": 1707468780826,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "ebf05a4e",
    "outputId": "989b7fc3-6e36-4ac7-df48-2b2cfaa86b50"
   },
   "outputs": [],
   "source": [
    "# Areas under the train and test ROC curves of the XGBoost model\n",
    "train_auc = auc(fpr_tr, tpr_tr)\n",
    "test_auc = auc(fpr_te, tpr_te)\n",
    "# Only the training value is displayed here; test_auc is used in the ROC plot legend\n",
    "train_auc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db4b578d",
   "metadata": {
    "executionInfo": {
     "elapsed": 397,
     "status": "ok",
     "timestamp": 1707468820612,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "db4b578d"
   },
   "outputs": [],
   "source": [
    "#ROC curve\n",
    "fig, ax = plt.subplots(figsize=(5,5))\n",
    "# x axis: true-positive rate; y axis: 1 - false-positive rate\n",
    "bkg_rejection_tr = 1-fpr_tr\n",
    "bkg_rejection_te = 1-fpr_te\n",
    "ax.plot(tpr_tr, bkg_rejection_tr, lw=1, label='train (area - %0.3f)'%(train_auc))\n",
    "ax.plot(tpr_te, bkg_rejection_te, lw=1, label='test (area - %0.3f)'%(test_auc))\n",
    "ax.grid()\n",
    "ax.legend(loc=\"lower right\")\n",
    "ax.set(\n",
    "    ylim=[0.0,1.0],\n",
    "    xlim=[0.0,1.0],\n",
    "    xlabel='signal efficiency',\n",
    "    ylabel='background rejection',\n",
    ")\n",
    "fig.savefig(\"roc_xgb.png\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e736a620",
   "metadata": {
    "executionInfo": {
     "elapsed": 1,
     "status": "ok",
     "timestamp": 1707468823214,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "e736a620"
   },
   "outputs": [],
   "source": [
    "def compare_train_test(clf, x_train, y_train, x_test, y_test, w_train, w_test, bins=40):\n",
    "    \"\"\"Overlay the classifier-score shapes of the train and test samples.\n",
    "\n",
    "    Events with y >= 0.5 are treated as signal and events with y < 0.5 as\n",
    "    background. Each of the four sub-samples is scored with\n",
    "    clf.predict_proba(...)[:, 1] and histogrammed, weighted and normalised\n",
    "    to unit area, on one common binning over [0, 1]: training samples are\n",
    "    drawn as step histograms, test samples as error-bar points. The figure\n",
    "    is written to XGBoutput.png.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    clf : fitted classifier exposing predict_proba\n",
    "    x_train, x_test : feature arrays, indexable with a boolean mask\n",
    "    y_train, y_test : label arrays aligned with the features\n",
    "    w_train, w_test : per-event weight arrays aligned with the features\n",
    "    bins : number of bins (or explicit bin edges) shared by all histograms\n",
    "    \"\"\"\n",
    "    decisions = []\n",
    "    wts = []\n",
    "    for x, y, w in ((x_train, y_train, w_train), (x_test, y_test, w_test)):\n",
    "        sig_mask = y >= 0.5\n",
    "        bkg_mask = y < 0.5\n",
    "        decisions += [clf.predict_proba(x[sig_mask])[:, 1],\n",
    "                      clf.predict_proba(x[bkg_mask])[:, 1]]\n",
    "        wts += [w[sig_mask], w[bkg_mask]]\n",
    "\n",
    "    # One set of edges for every histogram: the scores are probabilities, so\n",
    "    # [0, 1] covers them all and the train and test bins line up one-to-one.\n",
    "    # (The original mixed range=(0,1) with a data-driven range, so the\n",
    "    # background train histogram did not share bins with its test points.)\n",
    "    edges = np.histogram_bin_edges(decisions[0], bins=bins, range=(0, 1))\n",
    "    centers = (edges[:-1] + edges[1:]) / 2\n",
    "\n",
    "    plt.figure(figsize=(5, 5), dpi=100)\n",
    "\n",
    "    # Training samples: step histograms\n",
    "    train_styles = ((0, 'r', 'S (train)'), (1, 'b', 'B (train)'))\n",
    "    for idx, colour, tag in train_styles:\n",
    "        plt.hist(decisions[idx], weights=wts[idx],\n",
    "                 color=colour, alpha=0.7, bins=edges,\n",
    "                 histtype='step', density=True,\n",
    "                 label=tag)\n",
    "\n",
    "    # Test samples: points with error bars at the bin centres\n",
    "    test_styles = ((2, 'r', 'S (test)'), (3, 'b', 'B (test)'))\n",
    "    for idx, colour, tag in test_styles:\n",
    "        hist, _ = np.histogram(decisions[idx], weights=wts[idx],\n",
    "                               bins=edges, density=True)\n",
    "        # Approximate Poisson errors: rescale the density so it sums to the\n",
    "        # number of events in THIS sample (the original used the signal-test\n",
    "        # size for the background too), take sqrt per bin, then scale back.\n",
    "        scale = len(decisions[idx]) / hist.sum()\n",
    "        err = np.sqrt(hist * scale) / scale\n",
    "        plt.errorbar(centers, hist, yerr=err, fmt='none', c=colour, label=tag)\n",
    "\n",
    "    plt.xlabel(\"XGB output\")\n",
    "    plt.ylabel(\"Normalized units\")\n",
    "    plt.legend(loc='best')\n",
    "    plt.savefig(\"XGBoutput.png\", bbox_inches=\"tight\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e95848a",
   "metadata": {
    "executionInfo": {
     "elapsed": 2,
     "status": "ok",
     "timestamp": 1707468826006,
     "user": {
      "displayName": "Arun Nayak",
      "userId": "13683245814116317713"
     },
     "user_tz": -330
    },
    "id": "1e95848a",
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Compare the XGBoost score distributions of the train and test samples\n",
    "compare_train_test(cls,x_train,y_train,x_test,y_test,w_train,w_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "004a0e0c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "@webio": {
   "lastCommId": null,
   "lastKernelId": null
  },
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
