In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from holoviews.annotators import preprocess
from material.plugins.search.config import pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
In [4]:
# 读取数据
try:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/testA.csv')
sample_submit = pd.read_csv('data/sample_submit.csv')
print("数据加载成功")
except FileNotFoundError as e:
print(f"加载数据错误: {e}")
raise
数据加载成功
In [5]:
# 处理缺失
train_data = train_data.dropna(subset=['isDefault'])
In [6]:
# 分离特征和标签
X = train_data.drop(['id','isDefault'], axis=1)
y = train_data['isDefault']
X_test = test_data.drop(['id'], axis=1)
In [7]:
# 确定数值和分类特征
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns
In [8]:
# 数据特征处理:缺失值填补和标准化
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='mean')),
('scaler', StandardScaler())
])
# 分类特征处理:缺失值填补和独热编码
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
In [9]:
# 处理应用于特征
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features),
]
)
In [10]:
# 数据分为训练集和验证集
X_train, X_valid, y_train, y_valid = train_test_split(
X, y,
test_size=0.2,
random_state=42
)
In [30]:
# 构建与处理和模型的pipeline
model = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', RandomForestClassifier(
n_estimators=100,
n_jobs=-1, # 压榨CPU所有核心
max_depth=20, # 保护内存不溢出
class_weight='balanced',
random_state=42
)),
])
In [31]:
# 模型训练
model.fit(X_train, y_train)
Out[31]:
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
StandardScaler())]),
Index(['loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle',
'homeOwnership', 'annualIncome', 'verificationStatus', 'purpose',
'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow',
'fico...
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(fill_value='missing',
strategy='constant')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['grade', 'subGrade', 'employmentLength', 'issueDate',
'earliesCreditLine'],
dtype='object'))])),
('classifier',
RandomForestClassifier(class_weight='balanced', max_depth=20,
n_jobs=-1, random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
| steps | [('preprocessor', ...), ('classifier', ...)] | |
| transform_input | None | |
| memory | None | |
| verbose | False |
Parameters
| transformers | [('num', ...), ('cat', ...)] | |
| remainder | 'drop' | |
| sparse_threshold | 0.3 | |
| n_jobs | None | |
| transformer_weights | None | |
| verbose | False | |
| verbose_feature_names_out | True | |
| force_int_remainder_cols | 'deprecated' |
Index(['loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle',
'homeOwnership', 'annualIncome', 'verificationStatus', 'purpose',
'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow',
'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal',
'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType',
'title', 'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7',
'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14'],
dtype='object')Parameters
| missing_values | nan | |
| strategy | 'mean' | |
| fill_value | None | |
| copy | True | |
| add_indicator | False | |
| keep_empty_features | False |
Parameters
| copy | True | |
| with_mean | True | |
| with_std | True |
Index(['grade', 'subGrade', 'employmentLength', 'issueDate',
'earliesCreditLine'],
dtype='object')Parameters
| missing_values | nan | |
| strategy | 'constant' | |
| fill_value | 'missing' | |
| copy | True | |
| add_indicator | False | |
| keep_empty_features | False |
Parameters
| categories | 'auto' | |
| drop | None | |
| sparse_output | True | |
| dtype | <class 'numpy.float64'> | |
| handle_unknown | 'ignore' | |
| min_frequency | None | |
| max_categories | None | |
| feature_name_combiner | 'concat' |
Parameters
| n_estimators | 100 | |
| criterion | 'gini' | |
| max_depth | 20 | |
| min_samples_split | 2 | |
| min_samples_leaf | 1 | |
| min_weight_fraction_leaf | 0.0 | |
| max_features | 'sqrt' | |
| max_leaf_nodes | None | |
| min_impurity_decrease | 0.0 | |
| bootstrap | True | |
| oob_score | False | |
| n_jobs | -1 | |
| random_state | 42 | |
| verbose | 0 | |
| warm_start | False | |
| class_weight | 'balanced' | |
| ccp_alpha | 0.0 | |
| max_samples | None | |
| monotonic_cst | None |
In [38]:
# 在验证集上进行预测
y_pred = model.predict(X_valid)
print(f'Validation Accuracy: {accuracy_score(y_valid, y_pred)}')
Validation Accuracy: 0.65893125
In [33]:
# 对测试集进行预测
test_data_processed = model.named_steps['preprocessor'].transform(X_test)
test_predictions = model.named_steps['classifier'].predict(test_data_processed)
In [39]:
# 打印验证集的准确率
print(f'Validation Accuracy: {accuracy_score(y_valid, y_pred):.4f}')
Validation Accuracy: 0.6589
In [40]:
# 创建提交文件
submission = sample_submit.copy()
submission['isDefault'] = test_predictions
submission.to_csv('submission.csv', index=False)
In [42]:
# 看看 1 和 0 的比例
print("训练集类别分布:")
print(y_train.value_counts())
# 看看你预测出来的结果里有没有哪怕一个 1
print("\n预测结果类别分布:")
print(pd.Series(test_predictions).value_counts())
训练集类别分布: isDefault 0 512066 1 127934 Name: count, dtype: int64 预测结果类别分布: 0 120207 1 79793 Name: count, dtype: int64