SHAP values calculation speed comparison for gradient boosting libraries
Here we compare the speed of SHAP value computation in CatBoost, LightGBM, and XGBoost. All boosting models are trained on GPU, but SHAP evaluation is done on CPU.
We use the epsilon_normalized dataset from here.
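If you do not have the data locally, the sketch below shows one way to fetch it (an assumption on our side: the files are taken from the LIBSVM binary classification datasets page; substitute your own mirror if needed).
[ ]:
# Download and decompress the dataset (assumed source: the LIBSVM binary
# classification datasets page).
import bz2
import shutil
import urllib.request

LIBSVM_URL = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary"
for archive in ("epsilon_normalized.bz2", "epsilon_normalized.t.bz2"):
    urllib.request.urlretrieve(f"{LIBSVM_URL}/{archive}", archive)
    # Stream the decompression: each file is several gigabytes.
    with bz2.open(archive, "rb") as src, open(archive[:-4], "wb") as dst:
        shutil.copyfileobj(src, dst)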
[1]:
import copy
import datetime
import os
import catboost
import lightgbm as lgb
import numpy as np
import pandas as pd
import tqdm
import xgboost as xgb
from sklearn import datasets
[2]:
catboost.__version__, lgb.__version__, xgb.__version__
[2]:
('0.11.2', '2.2.2', '0.81')
[3]:
train_data, train_target = datasets.load_svmlight_file("epsilon_normalized")
test_data, test_target = datasets.load_svmlight_file(
    "epsilon_normalized.t",
)
Parameters
[4]:
num_iters = 1000
lr = 0.1
max_bin = 128
gpu_device = "0"  # specify your GPU (used only for training)
random_state = 0
[5]:
train_target[train_target == -1] = 0
test_target[test_target == -1] = 0
[6]:
def preprocess_data(data, label=None, mode="train", boosting=None):
    assert boosting is not None
    if boosting == "xgboost":
        return xgb.DMatrix(data, label)
    elif boosting == "lightgbm":
        if mode == "train":
            return lgb.Dataset(data, label)
        else:
            return data
    elif boosting == "catboost":
        data = catboost.FeaturesData(num_feature_data=data)
        return catboost.Pool(data, label)
    else:
        raise RuntimeError("Unknown boosting library")
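For reference, a small usage sketch on a hypothetical 2×3 matrix: at test time LightGBM predicts straight from the raw matrix, while XGBoost and CatBoost wrap the data in their own containers.
[ ]:
# Usage sketch (hypothetical tiny matrix, not part of the benchmark).
dummy_X = np.zeros((2, 3), dtype=np.float32)
dummy_y = np.array([0, 1])
preprocess_data(dummy_X, dummy_y, boosting="xgboost")       # xgb.DMatrix
preprocess_data(dummy_X, dummy_y, boosting="catboost")      # catboost.Pool
preprocess_data(dummy_X, mode="test", boosting="lightgbm")  # the matrix itself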
[7]:
def create_parameters(base_params, boosting=None, **kwargs):
    assert boosting is not None
    assert isinstance(base_params, dict)
    params = copy.copy(base_params)
    if boosting == "xgboost":
        params["objective"] = "binary:logistic"
        params["max_depth"] = kwargs["depth"]
        params["tree_method"] = "gpu_hist"
        params["gpu_id"] = gpu_device
    elif boosting == "lightgbm":
        params["objective"] = "binary"
        params["device"] = "gpu"
        params["gpu_device_id"] = gpu_device
        params["num_leaves"] = 2 ** kwargs["depth"]
    elif boosting == "catboost":
        params["objective"] = "Logloss"
        params["task_type"] = "GPU"
        params["devices"] = gpu_device
        params["bootstrap_type"] = "Bernoulli"
        params["logging_level"] = "Silent"
    else:
        raise RuntimeError("Unknown boosting library")
    return params
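LightGBM grows trees leaf-wise, so a fixed depth d is emulated here by capping num_leaves at 2**d; XGBoost takes the depth directly, and CatBoost's depth key is added later in the benchmark loop. For example:
[ ]:
# The same logical depth maps to library-specific parameters.
create_parameters({"learning_rate": lr}, boosting="lightgbm", depth=6)["num_leaves"]  # 64
create_parameters({"learning_rate": lr}, boosting="xgboost", depth=6)["max_depth"]    # 6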
[8]:
def train(data, params, num_iters, boosting=None):
    assert boosting is not None
    if boosting == "xgboost":
        return xgb.train(params=params, dtrain=data, num_boost_round=num_iters)
    elif boosting == "lightgbm":
        return lgb.train(params=params, train_set=data, num_boost_round=num_iters)
    elif boosting == "catboost":
        return catboost.train(pool=data, params=params, num_boost_round=num_iters)
    else:
        raise RuntimeError("Unknown boosting library")
[9]:
def predict_shap(model, data, boosting=None):
    assert boosting is not None
    if boosting == "xgboost":
        return model.predict(data, pred_contribs=True)
    elif boosting == "lightgbm":
        return model.predict(data, pred_contrib=True)
    elif boosting == "catboost":
        return model.get_feature_importance(data, fstr_type="ShapValues")
    else:
        raise RuntimeError("Unknown boosting library")
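All three libraries return one column per feature plus a trailing bias (expected value) column, which is why the benchmark below asserts a shape of (n_samples, n_features + 1). The rows sum to the model's raw margin prediction; a quick sanity check, sketched for XGBoost assuming a trained booster bst and the dense matrix dense_test from the loop below:
[ ]:
# Additivity sanity check (sketch; assumes `bst` and `dense_test` exist).
dtest = xgb.DMatrix(dense_test)
shap_vals = bst.predict(dtest, pred_contribs=True)  # (n_samples, n_features + 1)
margins = bst.predict(dtest, output_margin=True)    # raw scores before the sigmoid
assert np.allclose(shap_vals.sum(axis=1), margins, atol=1e-4)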
[10]:
def create_path(boosting, params):
    fname = [boosting]
    for key, value in sorted(params.items()):
        fname.append(str(key))
        fname.append(str(value))
    fname = "_".join(fname)
    fname = fname.replace(".", "")
    fname += ".model"
    return fname
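Keys are sorted and dots are stripped, so the mapping from parameters to file names is deterministic:
[ ]:
create_path("xgboost", {"learning_rate": 0.1, "depth": 6})
# -> 'xgboost_depth_6_learning_rate_01.model'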
[11]:
def load_model(fname, boosting):
    if boosting == "xgboost":
        # passing model_file already loads the model; no extra load_model call needed
        bst = xgb.Booster(model_file=fname)
    elif boosting == "lightgbm":
        bst = lgb.Booster(model_file=fname)
    elif boosting == "catboost":
        bst = catboost.CatBoost()
        bst.load_model(fname)
    else:
        raise RuntimeError("Unknown boosting")
    return bst
[12]:
base_params = {"learning_rate": lr, "max_bin": max_bin, "random_state": random_state}
[13]:
result = []
boosting_list = ["xgboost", "catboost", "lightgbm"]
depth_list = [2, 4, 6, 8, 10]
lens_list = [1000, 5000, 10000]
for gb_type in boosting_list:
    print(f"{gb_type} is going")
    for size_test in lens_list:
        print(f"size test {size_test}")
        sep_test_data = test_data[:size_test]
        sep_test_target = test_target[:size_test]
        # comment out the next line if you have already trained all the models
        train_preprocessed = preprocess_data(train_data, train_target, boosting=gb_type)
        dense_test = sep_test_data.toarray().astype(np.float32)
        for depth in tqdm.tqdm(depth_list):
            start_test_preproc = datetime.datetime.now()
            test_preprocessed = preprocess_data(dense_test, sep_test_target, mode="test", boosting=gb_type)
            finish_test_preproc = datetime.datetime.now()
            preprocessing_delta = finish_test_preproc - start_test_preproc
            preprocessing_delta = preprocessing_delta.total_seconds()
            params = create_parameters(base_params, boosting=gb_type, depth=depth)
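            # 'depth' is CatBoost's tree-depth parameter; XGBoost and LightGBM
            # already encode depth above, so for them this key mainly makes the
            # saved model file name unique.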
            params["depth"] = depth
            fname = create_path(gb_type, params)
            if os.path.exists(fname):
                print("model exist")
                bst = load_model(fname, boosting=gb_type)
            else:
                print("model is training")
                start_train = datetime.datetime.now()
                bst = train(train_preprocessed, params, num_iters=num_iters, boosting=gb_type)
                finish_train = datetime.datetime.now()
                delta_train = finish_train - start_train
                delta_train = int(delta_train.total_seconds() * 1000)
                bst.save_model(fname)
            start_time = datetime.datetime.now()
            preds = predict_shap(bst, test_preprocessed, boosting=gb_type)
            assert preds.shape == (sep_test_data.shape[0], sep_test_data.shape[1] + 1)
            finish_time = datetime.datetime.now()
            delta = finish_time - start_time
            delta = delta.total_seconds()
            current_res = {
                "preprocessing_time": preprocessing_delta,
                "boosting": gb_type,
                "test_size": size_test,
                "depth": depth,
                "time": delta,
            }
            result.append(current_res)
        print("*" * 40)
[14]:
result_df = pd.DataFrame(result)
[16]:
result_df.to_csv(f"shap_benchmark_{max_bin}_max_bin_with_test_sizes.csv", index=False)
[17]:
result_df = pd.read_csv(
    "shap_benchmark_128_max_bin_with_test_sizes.csv",
)
result_df.pivot_table(index=["test_size", "depth"], columns="boosting", values="time")
[17]:
SHAP computation time in seconds:

| test_size | depth | catboost | lightgbm | xgboost |
|---|---|---|---|---|
| 1000 | 2 | 0.311027 | 0.090156 | 0.112515 |
| 1000 | 4 | 0.281931 | 0.578531 | 0.300671 |
| 1000 | 6 | 0.464603 | 4.159926 | 1.468442 |
| 1000 | 8 | 4.918599 | 23.844245 | 7.847191 |
| 1000 | 10 | 93.152000 | 119.527824 | 30.872254 |
| 5000 | 2 | 1.171963 | 0.284673 | 0.241316 |
| 5000 | 4 | 1.081119 | 2.094985 | 0.931881 |
| 5000 | 6 | 1.319114 | 20.624486 | 6.498283 |
| 5000 | 8 | 5.807985 | 118.552238 | 38.992395 |
| 5000 | 10 | 95.049909 | 601.251603 | 153.408904 |
| 10000 | 2 | 2.048301 | 0.621454 | 0.509722 |
| 10000 | 4 | 2.263058 | 4.291201 | 1.935541 |
| 10000 | 6 | 2.396371 | 42.788038 | 12.981580 |
| 10000 | 8 | 7.078056 | 240.614644 | 77.883250 |
| 10000 | 10 | 95.680684 | 1189.685032 | 306.529277 |
[18]:
result_df.pivot_table(index="test_size", columns="boosting", values="preprocessing_time")
[18]:
Test-set preprocessing time in seconds:

| test_size | catboost | lightgbm | xgboost |
|---|---|---|---|
| 1000 | 0.069569 | 0.002816 | 0.011025 |
| 5000 | 0.349831 | 0.000006 | 0.047836 |
| 10000 | 0.770179 | 0.000006 | 0.089032 |