算法应用–携程客户流失预测
数据说明
数据集为携程云海数据比赛题目云海赛事,共51个特征(其中一个为label[0,1分类]),缺失较严重.
数据概述
library(tidyverse)
library(woeBinning)
library(rsample)
library(ggthemes)
# set theme
old <- theme_set(theme_fivethirtyeight() + theme(text = element_text(family = "Menlo")))
# 设置种子
set.seed(1111)
dat <- read_tsv("~/Documents/GitHub/customer_loss/data/yhxwfx_data/userlostprob_train.csv", na = c("NULL"))
dat %>% skimr::skim()
## Skim summary statistics
## n obs: 689945
## n variables: 51
##
## Variable type: Date
## variable missing complete n min max median
## arrival 0 689945 689945 2016-05-15 2016-06-20 2016-05-20
## d 0 689945 689945 2016-05-15 2016-05-21 2016-05-18
## n_unique
## 37
## 7
##
## Variable type: integer
## variable missing complete n mean
## cancelrate 11718 678227 689945 1051.6
## commentnums 67916 622029 689945 1272.09
## commentnums_pre 91577 598368 689945 1415.16
## h 0 689945 689945 14.46
## historyvisit_7ordernum 607030 82915 689945 1.86
## historyvisit_totalordernum 303420 386525 689945 11.71
## historyvisit_visit_detailpagenum 382711 307234 689945 37.15
## iforderpv_24h 0 689945 689945 0.19
## label 0 689945 689945 0.27
## landhalfhours 28633 661312 689945 6.09
## lasthtlordergap 242114 447831 689945 1e+05
## lastpvgap 97127 592818 689945 12049.41
## novoters 17027 672918 689945 1706.25
## novoters_pre 40989 648956 689945 1890.7
## ordercanncelednum 242114 447831 689945 154.18
## ordernum_oneyear 242114 447831 689945 11.64
## sampleid 0 689945 689945 628540.21
## sid 0 689945 689945 153.7
## visitnum_oneyear 97035 592910 689945 18551.85
## sd p0 p25 p50 p75 p100 hist
## 1509.07 1 137 503 1373 18930 ▇▁▁▁▁▁▁▁
## 2101.87 0 115 514 1670 34189 ▇▁▁▁▁▁▁▁
## 2329.42 0 137 592 1862 34189 ▇▁▁▁▁▁▁▁
## 6.3 0 11 15 20 23 ▃▁▂▅▆▇▆▇
## 2.1 1 1 1 2 106 ▇▁▁▁▁▁▁▁
## 17.25 1 2 6 14 711 ▇▁▁▁▁▁▁▁
## 73.4 1 6 18 44 6199 ▇▁▁▁▁▁▁▁
## 0.4 0 0 0 0 1 ▇▁▁▁▁▁▁▂
## 0.45 0 0 0 1 1 ▇▁▁▁▁▁▁▃
## 12.41 0 0 0 4 49 ▇▁▁▁▁▁▁▁
## 122784.31 0 14999 46890 138953 527026 ▇▂▁▁▁▁▁▁
## 25601.37 0 551 2848 10726 194386 ▇▁▁▁▁▁▁▁
## 2811.69 1 157 692 2196 45455 ▇▁▁▁▁▁▁▁
## 3116.12 1 187 783 2453 45436 ▇▁▁▁▁▁▁▁
## 398.46 0 0 2 153 13475 ▇▁▁▁▁▁▁▁
## 17.14 1 2 6 14 711 ▇▁▁▁▁▁▁▁
## 414681.5 24636 312320 6e+05 887460 2238426 ▇▇▇▇▁▁▁▁
## 277.81 0 17 62 180 9956 ▇▁▁▁▁▁▁▁
## 228860.31 1 471 1315 3141 9651192 ▇▁▁▁▁▁▁▁
##
## Variable type: numeric
## variable missing complete n mean sd
## avgprice 232684 457261 689945 422.46 290.85
## businessrate_pre 206049 483896 689945 0.37 0.23
## businessrate_pre2 86985 602960 689945 0.37 0.22
## cancelrate_pre 36930 653015 689945 0.34 0.18
## cityorders 38682 651263 689945 2.25 3.54
## cityuvs 7671 682274 689945 10.65 15.7
## commentnums_pre2 41488 648457 689945 1313.39 1719.51
## consuming_capacity 226108 463837 689945 39.15 23.24
## cr 232049 457896 689945 1.14 0.2
## cr_pre 29397 660548 689945 1.06 0.045
## ctrip_profits 244758 445187 689945 4.21 9.31
## customer_value_profit 250822 439123 689945 3.04 6.63
## customereval_pre2 28633 661312 689945 3.05 1.23
## decisionhabit_user 304495 385450 689945 5.32 38.52
## delta_price1 252799 437146 689945 79.07 512.94
## delta_price2 252195 437750 689945 77.28 391.41
## deltaprice_pre2_t1 146765 543180 689945 3.28 48.81
## firstorder_bu 312952 376993 689945 11.7 2.75
## historyvisit_avghotelnum 302069 387876 689945 6.51 41.05
## hotelcr 797 689148 689945 1.06 0.045
## hoteluv 797 689148 689945 95.09 169.98
## lowestprice 2014 687931 689945 318.81 575.78
## lowestprice_pre 30256 659689 689945 315.95 463.72
## lowestprice_pre2 29281 660664 689945 318.54 351.91
## novoters_pre2 32329 657616 689945 1787.2 2316.71
## ordercanceledprecent 242114 447831 689945 0.34 0.35
## price_sensitive 226108 463837 689945 24.65 26.69
## starprefer 225053 464892 689945 67.53 19.18
## uv_pre 29397 660548 689945 107.85 186.73
## uv_pre2 28756 661189 689945 103.35 157.12
## p0 p25 p50 p75 p100 hist
## 1 232 350 524 6383 ▇▁▁▁▁▁▁▁
## 0 0.15 0.39 0.57 0.99 ▇▅▅▆▆▅▁▁
## 0 0.17 0.4 0.55 0.99 ▇▅▆▇▇▅▁▁
## 0 0.23 0.32 0.42 1 ▂▆▇▆▂▁▁▁
## 0.007 0.13 0.63 2.75 14.51 ▇▂▁▁▁▁▁▁
## 0.007 0.83 3.53 13.33 67.14 ▇▂▁▁▁▁▁▁
## 0 270 768 1780 34189 ▇▁▁▁▁▁▁▁
## 0 22 33 51 100 ▂▇▇▅▃▂▁▂
## 1 1 1.05 1.21 11 ▇▁▁▁▁▁▁▁
## 1 1.03 1.06 1.09 2.95 ▇▁▁▁▁▁▁▁
## -44.31 0.34 1.35 4.32 600.82 ▇▁▁▁▁▁▁▁
## -24.07 0.27 0.99 3.14 598.06 ▇▁▁▁▁▁▁▁
## 0 2 3 4 6 ▁▃▆▇▆▇▃▁
## 0 2 3 5 3167 ▇▁▁▁▁▁▁▁
## -99879 -31 81 226 5398 ▁▁▁▁▁▁▁▇
## -43344 -29 69 198 5114 ▁▁▁▁▁▁▁▇
## -2296 -3 2 10 3324 ▁▁▁▇▁▁▁▁
## 1 12 13 13 21 ▁▁▁▁▇▁▁▁
## 0 2 4 7 3167 ▇▁▁▁▁▁▁▁
## 1 1.03 1.05 1.09 3.18 ▇▁▁▁▁▁▁▁
## 0.007 10.43 36.18 107.75 1722.61 ▇▁▁▁▁▁▁▁
## -3 116 200 380 1e+05 ▇▁▁▁▁▁▁▁
## 1 118 208 385 1e+05 ▇▁▁▁▁▁▁▁
## 1 145 233 388 43700 ▇▁▁▁▁▁▁▁
## 1 391 1054 2413 45436 ▇▁▁▁▁▁▁▁
## 0 0 0.25 0.57 1 ▇▂▂▂▁▁▁▃
## 0 5 16 33 100 ▇▅▂▂▁▁▁▁
## 0 53.3 69.4 80.3 100 ▁▁▁▆▆▆▇▅
## 0.007 12.53 42.5 124.71 1722.61 ▇▁▁▁▁▁▁▁
## 0.007 17.56 51.29 126.2 1722.61 ▇▁▁▁▁▁▁▁
可以看到,数据缺失情况比较严重.
观察输出猜想部分缺失数据有一定的关系(lasthtlordergap,ordercanncelednum等缺失数量相同).验证猜想:
c1 <- (dat %>% filter(is.na(lasthtlordergap)) %>% pull(sampleid))
c2 <- (dat %>% filter(is.na(ordercanncelednum)) %>% pull(sampleid))
c3 <- (dat %>% filter(is.na(ordernum_oneyear)) %>% pull(sampleid))
c4 <- (dat %>% filter(is.na(ordercanceledprecent)) %>% pull(sampleid))
(!(c1 == c2)) %>% sum()
## [1] 0
(!(c1 == c3)) %>% sum()
## [1] 0
(!(c1 == c4)) %>% sum()
## [1] 0
发现这四个变量确实是同时缺失.
查看日期及因变量分布
dat %>%
ggplot(aes(arrival)) +
geom_bar(fill = "pink2")
dat %>%
ggplot(aes(d)) +
geom_bar(fill = "blue")
dat %>%
ggplot(aes(as.factor(label))) +
geom_bar()
酒店行业一般关注周末和节日,但从图中可以看出入住日期在5月20日
和5月21日
明显高于其他日期,周末无明显增加.
预定日期则分布较平均.
特征工程
- 缺失值处理
分析字段含义及缺失情况确定缺失值处理方式:
1.缺失率大于80% — 删除
2.中位数填充
3.0值填充 - 异常值处理
分析字段含义及极端情况确定极端值处理方式: 盖帽法等
- 删除相关性大于0.8字段
- 衍生变量: arrive - d(入住日期-预定日期)
建模
# 载入包 --------------------------------------------------------------------
library(xgboost)
library(pROC)
# 载入数据 --------------------------------------------------------------------
load(file = "~/Documents/GitHub/customer_loss/data/df_train.RDs")
load(file = "~/Documents/GitHub/customer_loss/data/df_test.RDs")
# 设置种子 --------------------------------------------------------------------
set.seed(6666)
建模(暂未调整参数),导入已经保存模型
load(file = "~/Documents/GitHub/customer_loss/data/xgb_md.model")
计算不同阈值准确率,召回率,精确度函数
# 计算不同阈值准确率,召回率,精确度函数-----------------------------
get_eval <- function(thr, dat, mdl) {
pred <- predict(mdl, newdata = as.matrix(dat[, -1]))
prediction <- as.numeric(pred > thr)
accuracy <- (table(prediction, dat$label)[1, 1] +
table(prediction, dat$label)[2, 2]) / (dat$label %>% length())
recall <- table(prediction, dat$label)[1, 1] / table(dat$label)[1]
precision <- table(prediction, dat$label)[1, 1] / table(prediction)[1]
list(thr = thr, recall = recall, precision = precision, accuracy = accuracy)
}
预测结果
pred_train <- predict(xgb_md2, newdata = as.matrix(df_train[, -1]))
prediction_train <- as.numeric(pred_train > 0.5)
table(prediction_train, df_train$label)
##
## prediction_train 0 1
## 0 355571 99067
## 1 19808 43013
pred_test <- predict(xgb_md2, newdata = as.matrix(df_test[, -1]))
prediction_test <- as.numeric(pred_test > 0.5)
table(prediction_test, df_test$label)
##
## prediction_test 0 1
## 0 118350 33290
## 1 6859 13987
# 阈值范围 --------------------------------------------------------------------
thr_ls <- seq(0.05, 0.6, by = 0.05)
result_train <- thr_ls %>% map(get_eval, dat = df_train, mdl = xgb_md2) %>% bind_rows()
result_test <- thr_ls %>% map(get_eval, dat = df_test, mdl = xgb_md2) %>% bind_rows()
不同阈值下recall,precision
result_train
## # A tibble: 12 x 4
## thr recall precision accuracy
## <dbl> <dbl> <dbl> <dbl>
## 1 0.05 0.0692 0.992 0.324
## 2 0.1 0.210 0.970 0.422
## 3 0.15 0.366 0.942 0.524
## 4 0.2 0.510 0.913 0.609
## 5 0.25 0.632 0.887 0.674
## 6 0.3 0.731 0.862 0.720
## 7 0.35 0.811 0.838 0.749
## 8 0.4 0.874 0.817 0.766
## 9 0.45 0.917 0.798 0.771
## 10 0.5 0.947 0.782 0.770
## 11 0.55 0.968 0.768 0.764
## 12 0.6 0.982 0.756 0.757
result_test
## # A tibble: 12 x 4
## thr recall precision accuracy
## <dbl> <dbl> <dbl> <dbl>
## 1 0.05 0.0660 0.989 0.321
## 2 0.1 0.206 0.962 0.418
## 3 0.15 0.362 0.934 0.518
## 4 0.2 0.506 0.908 0.604
## 5 0.25 0.630 0.882 0.670
## 6 0.3 0.729 0.858 0.716
## 7 0.35 0.809 0.835 0.745
## 8 0.4 0.871 0.813 0.762
## 9 0.45 0.915 0.796 0.768
## 10 0.5 0.945 0.780 0.767
## 11 0.55 0.966 0.767 0.762
## 12 0.6 0.980 0.755 0.754
ROC & AUC
# ROC & AUC----------------------------------------------------------
plot(roc(df_train$label, pred_train), col="blue", ylab = "train_sensitivity")
plot.roc(roc(df_train$label, pred_train),
add = T,
col = "red",
ylab = "train_sensitivity")
auc(roc(df_train$label, pred_train))
## Area under the curve: 0.7893
plot(roc(df_test$label, pred_test), col="blue", ylab = "test_sensitivity")
plot.roc(roc(df_test$label, pred_test),
add = T,
col = "red",
ylab = "test_sensitivity")
auc(roc(df_test$label, pred_test))
## Area under the curve: 0.7801
变量重要性
# 对模型影响最大的变量排序
xgb.importance(colnames(df_train[,-1]), model = xgb_md2)
## Feature Gain Cover Frequency
## 1: cr 0.153651789 0.047566831 0.041361078
## 2: intervals 0.118783815 0.060143559 0.040508272
## 3: h 0.094718212 0.085934130 0.057052703
## 4: visitnum_oneyear 0.064642718 0.071597258 0.077093638
## 5: iforderpv_24h 0.058117165 0.029205510 0.019699812
## 6: ordernum_oneyear 0.055669929 0.028744532 0.032577179
## 7: lasthtlordergap 0.041025620 0.061993323 0.066263005
## 8: sid 0.036329083 0.047257526 0.057820229
## 9: delta_price2 0.033102337 0.017224131 0.012621525
## 10: cityorders 0.032261176 0.030432655 0.038717380
## 11: lastpvgap 0.030984258 0.058553044 0.064557394
## 12: hotelcr 0.027527373 0.028615633 0.007931093
## 13: businessrate_pre2 0.027464538 0.023832944 0.027375064
## 14: ctrip_profits 0.026438912 0.055299986 0.057734948
## 15: landhalfhours 0.020100372 0.026046214 0.020040935
## 16: starprefer 0.017563156 0.021307767 0.039484905
## 17: novoters_pre 0.017305605 0.024518610 0.017482517
## 18: historyvisit_avghotelnum 0.016890755 0.036504439 0.034453352
## 19: historyvisit_visit_detailpagenum 0.016439137 0.019400069 0.035561999
## 20: avgprice 0.014344709 0.028634704 0.039143783
## 21: cr_pre 0.014202642 0.012878769 0.010063108
## 22: price_sensitive 0.013254857 0.013732056 0.034282790
## 23: uv_pre2 0.009866184 0.023878428 0.023025755
## 24: customereval_pre2 0.009512793 0.023686367 0.014241856
## 25: ordercanncelednum 0.007786983 0.007395971 0.019699812
## 26: lowestprice_pre2 0.007647376 0.019632453 0.018250043
## 27: ordercanceledprecent 0.006708844 0.006725483 0.018420604
## 28: firstorder_bu 0.005541326 0.014781626 0.015691625
## 29: lowestprice 0.005027691 0.019493050 0.009551424
## 30: lowestprice_pre 0.004798664 0.013786701 0.013900733
## 31: cancelrate_pre 0.004406408 0.009735586 0.011768719
## 32: deltaprice_pre2_t1 0.003458906 0.021131246 0.013559611
## 33: cancelrate 0.002800655 0.007273567 0.004775712
## 34: hoteluv 0.001626012 0.003055834 0.005287396
## Feature Gain Cover Frequency