8 min read

Customer Loss--Xgboost

算法应用–携程客户流失预测

数据说明

数据集为携程云海数据比赛题目云海赛事,共51个特征(其中一个为label[0,1分类]),缺失较严重.

数据概述

library(tidyverse)
library(woeBinning)
library(rsample)
library(ggthemes)

# set theme
old <- theme_set(theme_fivethirtyeight() + theme(text = element_text(family = "Menlo")))

# 设置种子
set.seed(1111)

dat <- read_tsv("~/Documents/GitHub/customer_loss/data/yhxwfx_data/userlostprob_train.csv", na = c("NULL"))
dat %>% skimr::skim()
## Skim summary statistics
##  n obs: 689945 
##  n variables: 51 
## 
## Variable type: Date 
##  variable missing complete      n        min        max     median
##   arrival       0   689945 689945 2016-05-15 2016-06-20 2016-05-20
##         d       0   689945 689945 2016-05-15 2016-05-21 2016-05-18
##  n_unique
##        37
##         7
## 
## Variable type: integer 
##                          variable missing complete      n      mean
##                        cancelrate   11718   678227 689945   1051.6 
##                       commentnums   67916   622029 689945   1272.09
##                   commentnums_pre   91577   598368 689945   1415.16
##                                 h       0   689945 689945     14.46
##            historyvisit_7ordernum  607030    82915 689945      1.86
##        historyvisit_totalordernum  303420   386525 689945     11.71
##  historyvisit_visit_detailpagenum  382711   307234 689945     37.15
##                     iforderpv_24h       0   689945 689945      0.19
##                             label       0   689945 689945      0.27
##                     landhalfhours   28633   661312 689945      6.09
##                   lasthtlordergap  242114   447831 689945  1e+05   
##                         lastpvgap   97127   592818 689945  12049.41
##                          novoters   17027   672918 689945   1706.25
##                      novoters_pre   40989   648956 689945   1890.7 
##                 ordercanncelednum  242114   447831 689945    154.18
##                  ordernum_oneyear  242114   447831 689945     11.64
##                          sampleid       0   689945 689945 628540.21
##                               sid       0   689945 689945    153.7 
##                  visitnum_oneyear   97035   592910 689945  18551.85
##         sd    p0    p25   p50    p75    p100     hist
##    1509.07     1    137   503   1373   18930 ▇▁▁▁▁▁▁▁
##    2101.87     0    115   514   1670   34189 ▇▁▁▁▁▁▁▁
##    2329.42     0    137   592   1862   34189 ▇▁▁▁▁▁▁▁
##       6.3      0     11    15     20      23 ▃▁▂▅▆▇▆▇
##       2.1      1      1     1      2     106 ▇▁▁▁▁▁▁▁
##      17.25     1      2     6     14     711 ▇▁▁▁▁▁▁▁
##      73.4      1      6    18     44    6199 ▇▁▁▁▁▁▁▁
##       0.4      0      0     0      0       1 ▇▁▁▁▁▁▁▂
##       0.45     0      0     0      1       1 ▇▁▁▁▁▁▁▃
##      12.41     0      0     0      4      49 ▇▁▁▁▁▁▁▁
##  122784.31     0  14999 46890 138953  527026 ▇▂▁▁▁▁▁▁
##   25601.37     0    551  2848  10726  194386 ▇▁▁▁▁▁▁▁
##    2811.69     1    157   692   2196   45455 ▇▁▁▁▁▁▁▁
##    3116.12     1    187   783   2453   45436 ▇▁▁▁▁▁▁▁
##     398.46     0      0     2    153   13475 ▇▁▁▁▁▁▁▁
##      17.14     1      2     6     14     711 ▇▁▁▁▁▁▁▁
##  414681.5  24636 312320 6e+05 887460 2238426 ▇▇▇▇▁▁▁▁
##     277.81     0     17    62    180    9956 ▇▁▁▁▁▁▁▁
##  228860.31     1    471  1315   3141 9651192 ▇▁▁▁▁▁▁▁
## 
## Variable type: numeric 
##                  variable missing complete      n    mean       sd
##                  avgprice  232684   457261 689945  422.46  290.85 
##          businessrate_pre  206049   483896 689945    0.37    0.23 
##         businessrate_pre2   86985   602960 689945    0.37    0.22 
##            cancelrate_pre   36930   653015 689945    0.34    0.18 
##                cityorders   38682   651263 689945    2.25    3.54 
##                   cityuvs    7671   682274 689945   10.65   15.7  
##          commentnums_pre2   41488   648457 689945 1313.39 1719.51 
##        consuming_capacity  226108   463837 689945   39.15   23.24 
##                        cr  232049   457896 689945    1.14    0.2  
##                    cr_pre   29397   660548 689945    1.06    0.045
##             ctrip_profits  244758   445187 689945    4.21    9.31 
##     customer_value_profit  250822   439123 689945    3.04    6.63 
##         customereval_pre2   28633   661312 689945    3.05    1.23 
##        decisionhabit_user  304495   385450 689945    5.32   38.52 
##              delta_price1  252799   437146 689945   79.07  512.94 
##              delta_price2  252195   437750 689945   77.28  391.41 
##        deltaprice_pre2_t1  146765   543180 689945    3.28   48.81 
##             firstorder_bu  312952   376993 689945   11.7     2.75 
##  historyvisit_avghotelnum  302069   387876 689945    6.51   41.05 
##                   hotelcr     797   689148 689945    1.06    0.045
##                   hoteluv     797   689148 689945   95.09  169.98 
##               lowestprice    2014   687931 689945  318.81  575.78 
##           lowestprice_pre   30256   659689 689945  315.95  463.72 
##          lowestprice_pre2   29281   660664 689945  318.54  351.91 
##             novoters_pre2   32329   657616 689945 1787.2  2316.71 
##      ordercanceledprecent  242114   447831 689945    0.34    0.35 
##           price_sensitive  226108   463837 689945   24.65   26.69 
##                starprefer  225053   464892 689945   67.53   19.18 
##                    uv_pre   29397   660548 689945  107.85  186.73 
##                   uv_pre2   28756   661189 689945  103.35  157.12 
##          p0    p25     p50     p75     p100     hist
##       1     232     350     524     6383    ▇▁▁▁▁▁▁▁
##       0       0.15    0.39    0.57     0.99 ▇▅▅▆▆▅▁▁
##       0       0.17    0.4     0.55     0.99 ▇▅▆▇▇▅▁▁
##       0       0.23    0.32    0.42     1    ▂▆▇▆▂▁▁▁
##       0.007   0.13    0.63    2.75    14.51 ▇▂▁▁▁▁▁▁
##       0.007   0.83    3.53   13.33    67.14 ▇▂▁▁▁▁▁▁
##       0     270     768    1780    34189    ▇▁▁▁▁▁▁▁
##       0      22      33      51      100    ▂▇▇▅▃▂▁▂
##       1       1       1.05    1.21    11    ▇▁▁▁▁▁▁▁
##       1       1.03    1.06    1.09     2.95 ▇▁▁▁▁▁▁▁
##     -44.31    0.34    1.35    4.32   600.82 ▇▁▁▁▁▁▁▁
##     -24.07    0.27    0.99    3.14   598.06 ▇▁▁▁▁▁▁▁
##       0       2       3       4        6    ▁▃▆▇▆▇▃▁
##       0       2       3       5     3167    ▇▁▁▁▁▁▁▁
##  -99879     -31      81     226     5398    ▁▁▁▁▁▁▁▇
##  -43344     -29      69     198     5114    ▁▁▁▁▁▁▁▇
##   -2296      -3       2      10     3324    ▁▁▁▇▁▁▁▁
##       1      12      13      13       21    ▁▁▁▁▇▁▁▁
##       0       2       4       7     3167    ▇▁▁▁▁▁▁▁
##       1       1.03    1.05    1.09     3.18 ▇▁▁▁▁▁▁▁
##       0.007  10.43   36.18  107.75  1722.61 ▇▁▁▁▁▁▁▁
##      -3     116     200     380    1e+05    ▇▁▁▁▁▁▁▁
##       1     118     208     385    1e+05    ▇▁▁▁▁▁▁▁
##       1     145     233     388    43700    ▇▁▁▁▁▁▁▁
##       1     391    1054    2413    45436    ▇▁▁▁▁▁▁▁
##       0       0       0.25    0.57     1    ▇▂▂▂▁▁▁▃
##       0       5      16      33      100    ▇▅▂▂▁▁▁▁
##       0      53.3    69.4    80.3    100    ▁▁▁▆▆▆▇▅
##       0.007  12.53   42.5   124.71  1722.61 ▇▁▁▁▁▁▁▁
##       0.007  17.56   51.29  126.2   1722.61 ▇▁▁▁▁▁▁▁

可以看到,数据缺失情况比较严重.

观察输出猜想部分缺失数据有一定的关系(lasthtlordergap,ordercanncelednum等缺失数量相同).验证猜想:

c1 <- (dat %>% filter(is.na(lasthtlordergap)) %>% pull(sampleid))
c2 <- (dat %>% filter(is.na(ordercanncelednum)) %>% pull(sampleid)) 
c3 <- (dat %>% filter(is.na(ordernum_oneyear)) %>% pull(sampleid)) 
c4 <- (dat %>% filter(is.na(ordercanceledprecent)) %>% pull(sampleid)) 
(!(c1 == c2)) %>% sum()
## [1] 0
(!(c1 == c3)) %>% sum()
## [1] 0
(!(c1 == c4)) %>% sum()
## [1] 0

发现这四个变量确实是同时缺失.

查看日期及因变量分布

dat %>% 
    ggplot(aes(arrival)) + 
    geom_bar(fill = "pink2")

dat %>% 
    ggplot(aes(d)) + 
    geom_bar(fill = "blue")

dat %>% 
    ggplot(aes(as.factor(label))) + 
    geom_bar()

酒店行业一般关注周末和节日,但从图中可以看出入住日期在5月20日5月21日明显高于其他日期,周末无明显增加.
预定日期则分布较平均.

特征工程

  • 缺失值处理
    分析字段含义及缺失情况确定缺失值处理方式:
    1.缺失率大于80% — 删除
    2.中位数填充
    3.0值填充
  • 异常值处理
    分析字段含义及极端情况确定极端值处理方式: 盖帽法等
  • 删除相关性大于0.8字段
  • 衍生变量: arrive - d(入住日期-预定日期)

建模

# 载入包 --------------------------------------------------------------------
library(xgboost)
library(pROC)
# 载入数据 --------------------------------------------------------------------

load(file = "~/Documents/GitHub/customer_loss/data/df_train.RDs")
load(file = "~/Documents/GitHub/customer_loss/data/df_test.RDs")

# 设置种子 --------------------------------------------------------------------

set.seed(6666)

建模(暂未调整参数),导入已经保存模型

load(file = "~/Documents/GitHub/customer_loss/data/xgb_md.model")

计算不同阈值准确率,召回率,精确度函数

# 计算不同阈值准确率,召回率,精确度函数-----------------------------

get_eval <- function(thr, dat, mdl) {
    
    pred <- predict(mdl, newdata = as.matrix(dat[, -1]))
    
    prediction <- as.numeric(pred > thr)
    
    accuracy <- (table(prediction, dat$label)[1, 1] + 
        table(prediction, dat$label)[2, 2]) / (dat$label %>% length())
    
    recall <- table(prediction, dat$label)[1, 1] / table(dat$label)[1]
    
    precision <- table(prediction, dat$label)[1, 1] / table(prediction)[1]
    
    list(thr = thr, recall = recall, precision = precision, accuracy = accuracy)
}

预测结果

pred_train <- predict(xgb_md2, newdata = as.matrix(df_train[, -1]))

prediction_train <- as.numeric(pred_train > 0.5)

table(prediction_train, df_train$label)
##                 
## prediction_train      0      1
##                0 355571  99067
##                1  19808  43013
pred_test <- predict(xgb_md2, newdata = as.matrix(df_test[, -1]))

prediction_test <- as.numeric(pred_test > 0.5)

table(prediction_test, df_test$label)
##                
## prediction_test      0      1
##               0 118350  33290
##               1   6859  13987
# 阈值范围 --------------------------------------------------------------------

thr_ls <- seq(0.05, 0.6, by = 0.05)

result_train <- thr_ls %>% map(get_eval, dat = df_train, mdl = xgb_md2) %>% bind_rows()

result_test <- thr_ls %>% map(get_eval, dat = df_test, mdl = xgb_md2) %>% bind_rows()

不同阈值下recall,precision

result_train
## # A tibble: 12 x 4
##      thr recall precision accuracy
##    <dbl>  <dbl>     <dbl>    <dbl>
##  1  0.05 0.0692     0.992    0.324
##  2  0.1  0.210      0.970    0.422
##  3  0.15 0.366      0.942    0.524
##  4  0.2  0.510      0.913    0.609
##  5  0.25 0.632      0.887    0.674
##  6  0.3  0.731      0.862    0.720
##  7  0.35 0.811      0.838    0.749
##  8  0.4  0.874      0.817    0.766
##  9  0.45 0.917      0.798    0.771
## 10  0.5  0.947      0.782    0.770
## 11  0.55 0.968      0.768    0.764
## 12  0.6  0.982      0.756    0.757
result_test
## # A tibble: 12 x 4
##      thr recall precision accuracy
##    <dbl>  <dbl>     <dbl>    <dbl>
##  1  0.05 0.0660     0.989    0.321
##  2  0.1  0.206      0.962    0.418
##  3  0.15 0.362      0.934    0.518
##  4  0.2  0.506      0.908    0.604
##  5  0.25 0.630      0.882    0.670
##  6  0.3  0.729      0.858    0.716
##  7  0.35 0.809      0.835    0.745
##  8  0.4  0.871      0.813    0.762
##  9  0.45 0.915      0.796    0.768
## 10  0.5  0.945      0.780    0.767
## 11  0.55 0.966      0.767    0.762
## 12  0.6  0.980      0.755    0.754

ROC & AUC

# ROC & AUC----------------------------------------------------------
plot(roc(df_train$label, pred_train), col="blue", ylab = "train_sensitivity")
plot.roc(roc(df_train$label, pred_train), 
         add = T, 
         col = "red", 
         ylab = "train_sensitivity")

auc(roc(df_train$label, pred_train))
## Area under the curve: 0.7893
plot(roc(df_test$label, pred_test), col="blue", ylab = "test_sensitivity")  
plot.roc(roc(df_test$label, pred_test), 
         add = T,
         col = "red",
         ylab = "test_sensitivity")

auc(roc(df_test$label, pred_test))
## Area under the curve: 0.7801

变量重要性

# 对模型影响最大的变量排序
xgb.importance(colnames(df_train[,-1]), model = xgb_md2)
##                              Feature        Gain       Cover   Frequency
##  1:                               cr 0.153651789 0.047566831 0.041361078
##  2:                        intervals 0.118783815 0.060143559 0.040508272
##  3:                                h 0.094718212 0.085934130 0.057052703
##  4:                 visitnum_oneyear 0.064642718 0.071597258 0.077093638
##  5:                    iforderpv_24h 0.058117165 0.029205510 0.019699812
##  6:                 ordernum_oneyear 0.055669929 0.028744532 0.032577179
##  7:                  lasthtlordergap 0.041025620 0.061993323 0.066263005
##  8:                              sid 0.036329083 0.047257526 0.057820229
##  9:                     delta_price2 0.033102337 0.017224131 0.012621525
## 10:                       cityorders 0.032261176 0.030432655 0.038717380
## 11:                        lastpvgap 0.030984258 0.058553044 0.064557394
## 12:                          hotelcr 0.027527373 0.028615633 0.007931093
## 13:                businessrate_pre2 0.027464538 0.023832944 0.027375064
## 14:                    ctrip_profits 0.026438912 0.055299986 0.057734948
## 15:                    landhalfhours 0.020100372 0.026046214 0.020040935
## 16:                       starprefer 0.017563156 0.021307767 0.039484905
## 17:                     novoters_pre 0.017305605 0.024518610 0.017482517
## 18:         historyvisit_avghotelnum 0.016890755 0.036504439 0.034453352
## 19: historyvisit_visit_detailpagenum 0.016439137 0.019400069 0.035561999
## 20:                         avgprice 0.014344709 0.028634704 0.039143783
## 21:                           cr_pre 0.014202642 0.012878769 0.010063108
## 22:                  price_sensitive 0.013254857 0.013732056 0.034282790
## 23:                          uv_pre2 0.009866184 0.023878428 0.023025755
## 24:                customereval_pre2 0.009512793 0.023686367 0.014241856
## 25:                ordercanncelednum 0.007786983 0.007395971 0.019699812
## 26:                 lowestprice_pre2 0.007647376 0.019632453 0.018250043
## 27:             ordercanceledprecent 0.006708844 0.006725483 0.018420604
## 28:                    firstorder_bu 0.005541326 0.014781626 0.015691625
## 29:                      lowestprice 0.005027691 0.019493050 0.009551424
## 30:                  lowestprice_pre 0.004798664 0.013786701 0.013900733
## 31:                   cancelrate_pre 0.004406408 0.009735586 0.011768719
## 32:               deltaprice_pre2_t1 0.003458906 0.021131246 0.013559611
## 33:                       cancelrate 0.002800655 0.007273567 0.004775712
## 34:                          hoteluv 0.001626012 0.003055834 0.005287396
##                              Feature        Gain       Cover   Frequency