2 min read

Logistic Regression & Stepwise Selection

算法应用–携程客户流失预测

此次逻辑回归应用的数据依然为携程客户流失数据

library(tidyverse)
library(MASS)
library(pROC)

# Fix the random seed so that any later step that draws random numbers is reproducible
set.seed(1111)

# Restore the objects stored in the two data files into the current environment.
# NOTE(review): the code further down reads `df_train` and `df_test`, so these
# files presumably contain objects with exactly those names -- confirm.
# NOTE(review): the ".RDs" extension suggests saveRDS(), but load() expects files
# written by save(); verify how the files were created.
load("~/Documents/GitHub/customer_loss/data/df_train.RDs")
load("~/Documents/GitHub/customer_loss/data/df_test.RDs")

建模过程

为方便起见,这里不再重复建模过程,直接载入已建好的模型

# Restore the previously saved model object instead of refitting it.
# The predictions below expect the file to provide `lr_aic_back`.
load("~/Documents/GitHub/customer_loss/data/lr_aic_back.model")

# Predicted probabilities (response scale) for the training set
pred_train_LR_AIC <- predict(
  object = lr_aic_back,
  newdata = df_train,
  type = "response"
)

# Predicted probabilities (response scale) for the test set
pred_test_LR_AIC <- predict(
  object = lr_aic_back,
  newdata = df_test,
  type = "response"
)

# Training-set ROC curve.
# Build the curve once and reuse it for both the AUC and the plot, rather than
# calling roc() twice on the same data.
# direction = "<" states explicitly that cases score higher than controls, which
# is what predicted probabilities of the positive class mean; leaving it on
# pROC's automatic detection could silently flip the comparison.
roc_train_LR_AIC <- roc(
  response = df_train$label,
  predictor = pred_train_LR_AIC,
  direction = "<"
)

# Training-set AUC
auc(roc_train_LR_AIC)
## Area under the curve: 0.7007
# Plot the ROC curve (training set)
plot(roc_train_LR_AIC,
     col = "blue",
     ylab = "train_sensitivity")

# Test-set ROC curve.
# Build the curve once and reuse it for both the AUC and the plot, rather than
# calling roc() twice on the same data.
# direction = "<" states explicitly that cases score higher than controls, which
# is what predicted probabilities of the positive class mean; leaving it on
# pROC's automatic detection could silently flip the comparison.
roc_test_LR_AIC <- roc(
  response = df_test$label,
  predictor = pred_test_LR_AIC,
  direction = "<"
)

# Test-set AUC
auc(roc_test_LR_AIC)
## Area under the curve: 0.7016
# Plot the ROC curve (test set)
plot(roc_test_LR_AIC,
     col = "blue",
     ylab = "test_sensitivity")

为对比变量重要性,将数据标准化后重新建模

# Standardise every predictor (mean 0, sd 1) so that fitted coefficients can be
# compared in magnitude; the response column `label` is left untouched.
# Predictors are selected by name rather than by assuming `label` is column 1.
# The result is built as a data.frame with an explicitly named `label` column
# (placed last, after the scaled predictors): binding a bare vector onto the
# matrix returned by scale() would leave that column unnamed and yield a matrix,
# which cannot be used as `data` in a model formula.
feature_cols <- setdiff(names(df_train), "label")
df_train_scale <- data.frame(
  scale(df_train[feature_cols]),
  label = df_train[["label"]],
  check.names = FALSE
)

载入基于标准化数据建好的模型

# Restore the model that was fitted on the standardised data.
# The code below expects the file to provide `lr_scale_aic_back`.
load("~/Documents/GitHub/customer_loss/data/lr_scale_aic_back.model")

# Variable importance: rank the coefficients by absolute size.
# Term names are kept in their own vector; the coefficient values are stripped
# of names before entering the table.
name_scale_aic_back <- names(coef(lr_scale_aic_back))

coef_scale_aic_back <- lr_scale_aic_back %>%
  coef() %>%
  unname() %>%
  data.frame(vars = name_scale_aic_back, beta = .) %>%
  mutate(abs_coef = abs(beta)) %>%
  arrange(desc(abs_coef))

# Print the ranked table (the intercept is included as the first row)
coef_scale_aic_back
##                                vars         beta    abs_coef
## 1                       (Intercept) -1.090877469 1.090877469
## 2                         intervals -0.303154495 0.303154495
## 3                  ordernum_oneyear  0.253100544 0.253100544
## 4                     iforderpv_24h  0.243717294 0.243717294
## 5                  visitnum_oneyear -0.239545030 0.239545030
## 6                                cr  0.235518604 0.235518604
## 7                                 h -0.185350887 0.185350887
## 8                        cityorders  0.089755866 0.089755866
## 9                        cancelrate  0.088965702 0.088965702
## 10                      lowestprice -0.086693042 0.086693042
## 11                     delta_price2  0.084078964 0.084078964
## 12                          hotelcr  0.081661081 0.081661081
## 13                          hoteluv -0.076884207 0.076884207
## 14                              sid -0.064459424 0.064459424
## 15                           cr_pre  0.063210075 0.063210075
## 16                businessrate_pre2  0.063171709 0.063171709
## 17                         avgprice -0.059925933 0.059925933
## 18                        lastpvgap  0.055242920 0.055242920
## 19                 lowestprice_pre2 -0.052549346 0.052549346
## 20                    ctrip_profits  0.051192872 0.051192872
## 21                     novoters_pre  0.043840721 0.043840721
## 22                          uv_pre2 -0.041661459 0.041661459
## 23         historyvisit_avghotelnum -0.041369649 0.041369649
## 24                  price_sensitive  0.036234687 0.036234687
## 25                    landhalfhours  0.034746513 0.034746513
## 26               deltaprice_pre2_t1 -0.021384723 0.021384723
## 27                       starprefer -0.013198428 0.013198428
## 28             ordercanceledprecent  0.011950176 0.011950176
## 29 historyvisit_visit_detailpagenum  0.010735176 0.010735176
## 30                ordercanncelednum -0.009666662 0.009666662