### TabNet

<div class="alert alert-block alert-info">
    This is a PyTorch implementation of TabNet (Arik, S. O., & Pfister, T. 2019) by <a href="https://github.com/dreamquark-ai">DreamQuark</a>.
</div>

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetClassifier

np.random.seed(0)

from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
# Generate a synthetic dataset
X, y = make_classification(
    n_samples=50000, n_features=15, n_classes=2, n_informative=5, random_state=42
)

X, y = pd.DataFrame(X), pd.DataFrame(y)

X.columns = [f"feature_{i+1}" for i in range(len(X.columns))]

ix_train, ix_test = train_test_split(X.index, stratify=y, random_state=62)

In [3]:
class GiniScore(Metric):
    def __init__(self):
        self._name = "gini"
        self._maximize = True

    def __call__(self, y_true, y_score):
        auc = roc_auc_score(y_true, y_score[:, 1])
        return max(2 * auc - 1, 0.0)

In [4]:
max_epochs = 100
batch_size = round(0.10 * len(X.loc[ix_train]))
clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
    scheduler_params={
        "is_batch_level": True,
        "max_lr": 5e-2,
        "steps_per_epoch": int(X.loc[ix_train].shape[0] / batch_size) + 1,
        "epochs": max_epochs,
    },
    mask_type="entmax",
)



In [6]:
clf.fit(
    X_train=X.loc[ix_train].values,
    y_train=y.loc[ix_train].values.ravel(),
    eval_set=[
        (X.loc[ix_train].values, y.loc[ix_train].values.ravel()),
        (X.loc[ix_test].values, y.loc[ix_test].values.ravel()),
    ],
    eval_name=["train", "test"],
    eval_metric=["logloss", GiniScore],
    max_epochs=max_epochs,
    patience=5,
    batch_size=batch_size,
    virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
    loss_fn=nn.CrossEntropyLoss(),
)

# Make predictions using the model with two tree iterations
predictions_trn = clf.predict_proba(X.loc[ix_train].to_numpy())[:, 1]
predictions_tst = clf.predict_proba(X.loc[ix_test].to_numpy())[:, 1]

gini_trn = roc_auc_score(y.loc[ix_train], predictions_trn) * 2 - 1
gini_tst = roc_auc_score(y.loc[ix_test], predictions_tst) * 2 - 1

print(f"Train Gini score: {gini_trn:.2%}" f"Test Gini score: {gini_tst:.2%}")

epoch 0  | loss: 0.72843 | train_logloss: 0.69951 | train_gini: 0.36982 | test_logloss: 0.69944 | test_gini: 0.37703 |  0:00:03s
epoch 1  | loss: 0.60558 | train_logloss: 0.57118 | train_gini: 0.55522 | test_logloss: 0.5682  | test_gini: 0.56746 |  0:00:06s
epoch 2  | loss: 0.54865 | train_logloss: 0.50776 | train_gini: 0.65734 | test_logloss: 0.50583 | test_gini: 0.66613 |  0:00:09s
epoch 3  | loss: 0.50686 | train_logloss: 0.46785 | train_gini: 0.71664 | test_logloss: 0.46232 | test_gini: 0.72472 |  0:00:12s
epoch 4  | loss: 0.47736 | train_logloss: 0.43841 | train_gini: 0.75475 | test_logloss: 0.43204 | test_gini: 0.76403 |  0:00:15s
epoch 5  | loss: 0.44659 | train_logloss: 0.40937 | train_gini: 0.78939 | test_logloss: 0.40182 | test_gini: 0.79961 |  0:00:18s
epoch 6  | loss: 0.419   | train_logloss: 0.3838  | train_gini: 0.81762 | test_logloss: 0.37717 | test_gini: 0.82615 |  0:00:22s
epoch 7  | loss: 0.39237 | train_logloss: 0.36304 | train_gini: 0.83837 | test_logloss: 0.3584  |



Train Gini score: 96.45%
Test Gini score: 95.70%
