1. 导入需要的库¶
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# 让图能显示中文
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
2. 读取数据并查看基本情况¶
In [2]:
# 读入数据
df = pd.read_csv('credit_card.csv')
print('数据形状:', df.shape)
df.head()
数据形状: (8949, 18)
Out[2]:
| CUST_ID | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | C10001 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.4 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 |
| 1 | C10002 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.0 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 |
| 2 | C10003 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 |
| 3 | C10004 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.0 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | 0.000000 | 0.000000 | 12 |
| 4 | C10005 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.0 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 |
In [3]:
# 查看每列的类型和有多少缺失值
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8949 entries, 0 to 8948 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CUST_ID 8948 non-null object 1 BALANCE 8949 non-null float64 2 BALANCE_FREQUENCY 8949 non-null float64 3 PURCHASES 8949 non-null float64 4 ONEOFF_PURCHASES 8949 non-null float64 5 INSTALLMENTS_PURCHASES 8949 non-null float64 6 CASH_ADVANCE 8949 non-null float64 7 PURCHASES_FREQUENCY 8949 non-null float64 8 ONEOFF_PURCHASES_FREQUENCY 8949 non-null float64 9 PURCHASES_INSTALLMENTS_FREQUENCY 8949 non-null float64 10 CASH_ADVANCE_FREQUENCY 8949 non-null float64 11 CASH_ADVANCE_TRX 8949 non-null int64 12 PURCHASES_TRX 8949 non-null int64 13 CREDIT_LIMIT 8949 non-null float64 14 PAYMENTS 8949 non-null float64 15 MINIMUM_PAYMENTS 8949 non-null float64 16 PRC_FULL_PAYMENT 8949 non-null float64 17 TENURE 8949 non-null int64 dtypes: float64(14), int64(3), object(1) memory usage: 1.2+ MB
In [4]:
# 看看缺失值情况
df.isnull().sum()
Out[4]:
CUST_ID 1 BALANCE 0 BALANCE_FREQUENCY 0 PURCHASES 0 ONEOFF_PURCHASES 0 INSTALLMENTS_PURCHASES 0 CASH_ADVANCE 0 PURCHASES_FREQUENCY 0 ONEOFF_PURCHASES_FREQUENCY 0 PURCHASES_INSTALLMENTS_FREQUENCY 0 CASH_ADVANCE_FREQUENCY 0 CASH_ADVANCE_TRX 0 PURCHASES_TRX 0 CREDIT_LIMIT 0 PAYMENTS 0 MINIMUM_PAYMENTS 0 PRC_FULL_PAYMENT 0 TENURE 0 dtype: int64
In [5]:
# 看下数值列的统计信息
df.describe()
Out[5]:
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 |
| mean | 1564.647593 | 0.877350 | 1003.316936 | 592.503572 | 411.113579 | 978.959616 | 0.490405 | 0.202480 | 0.364478 | 0.135141 | 3.249078 | 14.711476 | 4494.449450 | 1733.336511 | 834.075035 | 0.153732 | 11.517935 |
| std | 2081.584016 | 0.236798 | 2136.727848 | 1659.968851 | 904.378205 | 2097.264344 | 0.401360 | 0.298345 | 0.397451 | 0.200132 | 6.824987 | 24.858552 | 3638.815725 | 2895.168146 | 2336.104378 | 0.292511 | 1.337134 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 50.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 |
| 25% | 128.365782 | 0.888889 | 39.800000 | 0.000000 | 0.000000 | 0.000000 | 0.083333 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1600.000000 | 383.282850 | 163.029483 | 0.000000 | 12.000000 |
| 50% | 873.680279 | 1.000000 | 361.490000 | 38.000000 | 89.000000 | 0.000000 | 0.500000 | 0.083333 | 0.166667 | 0.000000 | 0.000000 | 7.000000 | 3000.000000 | 857.062706 | 289.686899 | 0.000000 | 12.000000 |
| 75% | 2054.372848 | 1.000000 | 1110.170000 | 577.830000 | 468.650000 | 1113.868654 | 0.916667 | 0.300000 | 0.750000 | 0.222222 | 4.000000 | 17.000000 | 6500.000000 | 1901.279320 | 788.721609 | 0.142857 | 12.000000 |
| max | 19043.138560 | 1.000000 | 49039.570000 | 40761.250000 | 22500.000000 | 47137.211760 | 1.000000 | 1.000000 | 1.000000 | 1.500000 | 123.000000 | 358.000000 | 30000.000000 | 50721.483360 | 76406.207520 | 1.000000 | 12.000000 |
In [6]:
# 删掉CUST_ID列
data = df.drop('CUST_ID', axis=1)
# 用中位数填充缺失值
data = data.fillna(data.median())
print('处理后还有缺失值吗?', data.isnull().sum().sum())
处理后还有缺失值吗? 0
In [7]:
# 标准化
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)
# 转回DataFrame方便查看
data_scaled = pd.DataFrame(data_scaled, columns=data.columns)
data_scaled.head()
Out[7]:
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.732054 | -0.249881 | -0.424934 | -0.356957 | -0.349114 | -0.466805 | -0.806649 | -0.678716 | -0.707409 | -0.675294 | -0.476083 | -0.511381 | -0.960380 | -0.529026 | -0.297334 | -0.525588 | 0.360541 |
| 1 | 0.786858 | 0.134049 | -0.469584 | -0.356957 | -0.454607 | 2.605438 | -1.221928 | -0.678716 | -0.917090 | 0.573949 | 0.110032 | -0.591841 | 0.688601 | 0.818546 | 0.101998 | 0.234159 | 0.360541 |
| 2 | 0.447041 | 0.517980 | -0.107716 | 0.108843 | -0.454607 | -0.466805 | 1.269742 | 2.673295 | -0.917090 | -0.675294 | -0.476083 | -0.109082 | 0.826016 | -0.383857 | -0.088524 | -0.525588 | 0.360541 |
| 3 | 0.049015 | -1.017743 | 0.231995 | 0.546123 | -0.454607 | -0.368678 | -1.014290 | -0.399383 | -0.917090 | -0.258882 | -0.329554 | -0.551611 | 0.826016 | -0.598733 | -0.357057 | -0.525588 | 0.360541 |
| 4 | -0.358849 | 0.517980 | -0.462095 | -0.347317 | -0.454607 | -0.466805 | -1.014290 | -0.399383 | -0.917090 | -0.675294 | -0.476083 | -0.551611 | -0.905414 | -0.364421 | -0.252265 | -0.525588 | 0.360541 |
4. 用肘部法则确定最佳K值¶
通过画出不同K值下的SSE(簇内平方和),找拐点位置的K作为最佳聚类数。
In [8]:
# 试K从2到10
sse = []
k_range = range(2, 11)
for k in k_range:
km = KMeans(n_clusters=k, random_state=42, n_init=10)
km.fit(data_scaled)
sse.append(km.inertia_) # inertia_就是SSE
# 画肘部图
plt.figure(figsize=(8, 5))
plt.plot(k_range, sse, marker='o')
plt.xlabel('K值(聚类数)')
plt.ylabel('SSE(簇内平方和)')
plt.title('肘部法则确定最佳K值')
plt.grid(True)
plt.show()
从上图看,K=4时曲线下降变缓,所以选 K=4 比较合适。
5. 用K=4进行聚类¶
In [9]:
# 训练K-means模型
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
labels = kmeans.fit_predict(data_scaled)
# 把聚类结果加回原数据
data['cluster'] = labels
# 看每个簇有多少人
print('每个簇的用户数:')
print(data['cluster'].value_counts().sort_index())
每个簇的用户数: cluster 0 409 1 3366 2 1197 3 3977 Name: count, dtype: int64
6. 各簇用户特征分析¶
In [10]:
# 计算每个簇各字段的平均值
cluster_mean = data.groupby('cluster').mean()
cluster_mean
Out[10]:
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| cluster | |||||||||||||||||
| 0 | 3551.153761 | 0.986879 | 7681.620098 | 5095.878826 | 2587.208264 | 653.638891 | 0.946418 | 0.739031 | 0.788060 | 0.071290 | 2.085575 | 89.359413 | 9696.943765 | 7288.739497 | 1970.476256 | 0.286707 | 11.951100 |
| 1 | 894.768927 | 0.934715 | 1236.263333 | 593.995933 | 642.541696 | 209.816318 | 0.885255 | 0.297109 | 0.711930 | 0.042487 | 0.789067 | 22.092692 | 4214.013741 | 1331.325429 | 633.801648 | 0.269313 | 11.594474 |
| 2 | 4602.449658 | 0.968389 | 501.862982 | 320.188797 | 181.759123 | 4521.509581 | 0.287832 | 0.138911 | 0.185671 | 0.484792 | 14.294904 | 7.665831 | 7546.160857 | 3484.054216 | 2000.543192 | 0.034888 | 11.386800 |
| 3 | 1012.995969 | 0.790134 | 270.281184 | 210.068828 | 60.479311 | 597.154288 | 0.170292 | 0.086343 | 0.080662 | 0.114888 | 2.126226 | 2.907971 | 3278.263348 | 975.328059 | 535.626389 | 0.078002 | 11.448076 |
In [11]:
# 画热力图直观看每个簇的特点
# 这里先把均值标准化一下,方便对比
cluster_mean_scaled = (cluster_mean - cluster_mean.mean()) / cluster_mean.std()
plt.figure(figsize=(10, 8))
sns.heatmap(cluster_mean_scaled.T, annot=True, cmap='RdBu_r', center=0, fmt='.2f')
plt.title('各簇用户特征热力图(标准化后)')
plt.xlabel('簇')
plt.ylabel('特征')
plt.show()
7. 用PCA降维可视化聚类结果¶
因为数据有17个特征不好画图,用PCA降到2维就能可视化了。
In [12]:
# PCA降维到2个主成分
pca = PCA(n_components=2)
data_pca = pca.fit_transform(data_scaled)
# 画散点图
plt.figure(figsize=(10, 6))
for i in range(4):
plt.scatter(data_pca[labels == i, 0], data_pca[labels == i, 1],
label=f'簇 {i}', alpha=0.5)
plt.xlabel('主成分1')
plt.ylabel('主成分2')
plt.title('K-means聚类结果(PCA降维到2维)')
plt.legend()
plt.grid(True)
plt.show()
print(f'两个主成分一共解释了 {pca.explained_variance_ratio_.sum()*100:.2f}% 的方差')
两个主成分一共解释了 47.62% 的方差
8. 聚类结果解读与营销策略¶
结合上面的均值表和热力图,对 4 个簇分别画像如下(簇编号和上面表格一致):
簇 0:高端购物达人(409 人,约 4.6%)¶
- 特征:购买金额最高(约 7681),一次性购买(5095)和分期购买(2587)都遥遥领先
- 购买频率 0.95,购买交易笔数 89 笔,活跃度极高
- 信用额度最高(约 9696),还款金额最高(7288)
- 营销策略:推荐联名高端信用卡、积分加倍、商旅权益,绑定他们的忠诚度;可推大额免息分期
簇 1:日常活跃用户(3366 人,约 37.6%)¶
- 特征:购买金额中等(1236),但购买频率高达 0.88,分期付款频率 0.71
- 现金预借很少,行为健康
- 信用额度中等(4214)
- 营销策略:日常消费返现、商超联合优惠、小额免息分期,挖掘他们的消费潜力
簇 2:预借现金重度用户(1197 人,约 13.4%)¶
- 特征:现金预借金额最高(4521),现金预借频率最高(0.48),交易次数 14 次
- 购买金额却很低(501),余额最高(4602)
- 全额还款比例最低(仅 0.03),有较高的逾期风险
- 营销策略:重点风险监控,限制额度提升;可定向推现金分期产品,但要严格审核还款能力
簇 3:低活跃沉睡用户(3977 人,约 44.4%)¶
- 特征:用户数最多,但购买金额最低(270),购买频率最低(0.17)
- 信用额度也最低(3278),整体不活跃
- 营销策略:发新人优惠券、首刷礼、生日礼等激励活动,唤醒他们多刷卡
总结¶
通过 K-means 聚类把信用卡用户分成 4 类:
- 高端购物达人(簇 0)和日常活跃用户(簇 1)是优质客户,应重点维护
- 预借现金重度用户(簇 2)需要重点风险管控
- 低活跃用户(簇 3)人数最多,需要通过营销活动激活
针对不同人群制定不同的营销策略,可以在提高营销效率的同时,更好地控制信用风险。