In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from holoviews.annotators import preprocess
from material.plugins.search.config import pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
In [4]:
# 读取数据
try:
    train_data = pd.read_csv('data/train.csv')
    test_data = pd.read_csv('data/testA.csv')
    sample_submit = pd.read_csv('data/sample_submit.csv')
    print("数据加载成功")
except FileNotFoundError as e:
    print(f"加载数据错误: {e}")
    raise
数据加载成功
In [5]:
# 处理缺失
train_data = train_data.dropna(subset=['isDefault'])
In [6]:
# 分离特征和标签
X = train_data.drop(['id','isDefault'], axis=1)
y = train_data['isDefault']
X_test = test_data.drop(['id'], axis=1)
In [7]:
# 确定数值和分类特征
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns
In [8]:
# 数据特征处理:缺失值填补和标准化
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
# 分类特征处理:缺失值填补和独热编码
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
In [9]:
# 处理应用于特征
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)
In [10]:
# 数据分为训练集和验证集
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)
In [30]:
# 构建与处理和模型的pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        n_jobs=-1,      # 压榨CPU所有核心
        max_depth=20,   # 保护内存不溢出
        class_weight='balanced',
        random_state=42
    )),
])
In [31]:
# 模型训练
model.fit(X_train, y_train)
Out[31]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle',
       'homeOwnership', 'annualIncome', 'verificationStatus', 'purpose',
       'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow',
       'fico...
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['grade', 'subGrade', 'employmentLength', 'issueDate',
       'earliesCreditLine'],
      dtype='object'))])),
                ('classifier',
                 RandomForestClassifier(class_weight='balanced', max_depth=20,
                                        n_jobs=-1, random_state=42))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
steps  [('preprocessor', ...), ('classifier', ...)]
transform_input  None
memory  None
verbose  False
Parameters
transformers  [('num', ...), ('cat', ...)]
remainder  'drop'
sparse_threshold  0.3
n_jobs  None
transformer_weights  None
verbose  False
verbose_feature_names_out  True
force_int_remainder_cols  'deprecated'
Index(['loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle',
       'homeOwnership', 'annualIncome', 'verificationStatus', 'purpose',
       'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow',
       'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal',
       'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType',
       'title', 'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7',
       'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14'],
      dtype='object')
Parameters
missing_values  nan
strategy  'mean'
fill_value  None
copy  True
add_indicator  False
keep_empty_features  False
Parameters
copy  True
with_mean  True
with_std  True
Index(['grade', 'subGrade', 'employmentLength', 'issueDate',
       'earliesCreditLine'],
      dtype='object')
Parameters
missing_values  nan
strategy  'constant'
fill_value  'missing'
copy  True
add_indicator  False
keep_empty_features  False
Parameters
categories  'auto'
drop  None
sparse_output  True
dtype  <class 'numpy.float64'>
handle_unknown  'ignore'
min_frequency  None
max_categories  None
feature_name_combiner  'concat'
Parameters
n_estimators  100
criterion  'gini'
max_depth  20
min_samples_split  2
min_samples_leaf  1
min_weight_fraction_leaf  0.0
max_features  'sqrt'
max_leaf_nodes  None
min_impurity_decrease  0.0
bootstrap  True
oob_score  False
n_jobs  -1
random_state  42
verbose  0
warm_start  False
class_weight  'balanced'
ccp_alpha  0.0
max_samples  None
monotonic_cst  None
In [38]:
# 在验证集上进行预测
y_pred = model.predict(X_valid)
print(f'Validation Accuracy: {accuracy_score(y_valid, y_pred)}')
Validation Accuracy: 0.65893125
In [33]:
# 对测试集进行预测
test_data_processed = model.named_steps['preprocessor'].transform(X_test)
test_predictions = model.named_steps['classifier'].predict(test_data_processed)
In [39]:
# 打印验证集的准确率
print(f'Validation Accuracy: {accuracy_score(y_valid, y_pred):.4f}')
Validation Accuracy: 0.6589
In [40]:
# 创建提交文件
submission = sample_submit.copy()
submission['isDefault'] = test_predictions
submission.to_csv('submission.csv', index=False)
In [42]:
# 看看 1 和 0 的比例
print("训练集类别分布:")
print(y_train.value_counts())

# 看看你预测出来的结果里有没有哪怕一个 1
print("\n预测结果类别分布:")
print(pd.Series(test_predictions).value_counts())
训练集类别分布:
isDefault
0    512066
1    127934
Name: count, dtype: int64

预测结果类别分布:
0    120207
1     79793
Name: count, dtype: int64