diff --git a/README.md b/README.md new file mode 100644 index 0000000..7bbe0b9 --- /dev/null +++ b/README.md @@ -0,0 +1,113 @@ +# CPC 预后分类(影像组学 + NSE) + +基于 CT 影像组学特征与 NSE 实验室指标,对心脏骤停患者 CPC 评分进行二分类(CPC 1–2 vs 3–5),支持多种机器学习模型与 LASSO 特征筛选。 + +## 仓库结构 + +``` +code/cls/ +├── config/ # 实验默认配置(标签分箱、列名等) +├── models/ # 分类器工厂(统一超参入口) +├── pipeline/ # 数据准备 + 训练评估主流程 +├── utils/ # 工具函数(LASSO、指标、结果保存) +│ ├── pre4data.py +│ ├── util.py +│ └── tools/ # 数据对齐、NSE 处理等脚本 +├── classify_lab1.py # 固定 train/test + 可选 NSE(lab1) +├── classify_single.py# 单文件划分 train/test +├── classify_kfold.py # 5 折交叉验证(无 NSE) +├── classify_kfold_nse.py # 5 折 + NSE +├── run_experiment.py # 统一 CLI(任意 profile) +├── config/experiments.yaml # 路径与实验配置(外置) +├── feature_process.py# 特征表合并与清洗(数据预处理) +├── new_data_process.py +├── train_test.py # 划分并导出 train/test Excel +└── requirements.txt +``` + +## 环境 + +```bash +cd code/cls +pip install -r requirements.txt +``` + +## 配置路径(YAML + 环境变量) + +所有实验参数集中在 `code/cls/config/experiments.yaml`。路径使用占位符,无需改 Python 代码: + +| 变量 | 含义 | 默认值 | +|------|------|--------| +| `CLS_DATA_ROOT` | Excel 数据目录 | `./data` | +| `CLS_RESULTS_ROOT` | 结果输出根目录 | `./results` | +| `CLS_EXPERIMENT` | 默认 profile | 各入口脚本不同 | + +YAML 内可使用 `{data_root}/文件名.xlsx`,或 `${CLS_DATA_ROOT:-./data}/文件名.xlsx`。 + +**Windows 示例** + +```bat +set CLS_DATA_ROOT=D:\thrid_beijing_hospital_data +set CLS_RESULTS_ROOT=D:\thrid_beijing_hospital_data +python classify_lab1.py +``` + +**Linux / macOS 示例** + +```bash +export CLS_DATA_ROOT=/path/to/your/data +export CLS_RESULTS_ROOT=/path/to/your/results +python run_experiment.py -p kfold_nse +``` + +查看可用 profile: + +```bash +python run_experiment.py --list-profiles +``` + +| Profile | 说明 | +|---------|------| +| `lab1` | 固定 train/test,含 NSE | +| `single` | 单表 8:2 分层划分 | +| `kfold` | 5 折交叉验证(仅 CT 特征) | +| `kfold_nse` | 5 折 + NSE(每折独立 LASSO 与 NSE 拼接) | + +修改或新增实验:编辑 `config/experiments.yaml` 中 `experiments` 节点,或复制一份自定义 YAML 并用 `-c` 指定。 + +## 运行实验 + +在 `code/cls` 目录下执行: + +```bash 
+pip install -r requirements.txt +python classify_lab1.py # 等同: python run_experiment.py -p lab1 +python classify_single.py +python classify_kfold.py +python classify_kfold_nse.py # 5 折 + NSE +python run_experiment.py -p lab1 -c config/experiments.yaml +``` + +### 可选分类器 + +`classifier` 取值:`svm` | `logistic` | `gaussian_nb` | `xgboost` | `lightgbm` | `catboost` + +### 标签定义 + +默认:CPC 1–2 → 0,CPC 3–5 → 1。可在 `config/experiments.yaml` 的 profile 中通过 `cpc_bins` / `cpc_labels` 调整(对应 `ExperimentConfig` 的同名字段),无需改 Python 代码。 + +## 设计说明 + +- **配置与逻辑分离**:路径、模型、是否使用 NSE 等集中在 `ExperimentConfig`,避免在多个脚本中复制数百行训练代码。 +- **统一流水线**:`pipeline.experiment.run_experiment` 负责 LASSO、标准化、训练、指标汇总与结果写入。 +- **可扩展**:新增模型只需在 `models/factory.py` 注册;新增实验类型可复用 `pipeline.data` 与 `_run_single_fold`。 + +## 数据预处理脚本 + +| 脚本 | 用途 | +|------|------| +| `feature_process.py` | 多时间点影像组学表合并、剔除 diagnostics 列 | +| `feature_process.py` / `new_data_process.py` | 院内数据流水线(需配置本地 Excel 路径) | +| `utils/tools/*` | CTid–姓名–NSE 对齐等 | + +预处理脚本(`feature_process.py`、`utils/tools/*`)仍可能含历史绝对路径;建议同样改为读取 `CLS_DATA_ROOT` 或从 `config/experiments.yaml` 的 `paths` 段复制路径约定。 diff --git a/code/cls/.gitignore b/code/cls/.gitignore new file mode 100644 index 0000000..a2c0a65 --- /dev/null +++ b/code/cls/.gitignore @@ -0,0 +1,9 @@ +__pycache__/ +*.py[cod] +.pytest_cache/ +.venv/ +venv/ +results/ +results_*/ +*.xlsx +!data/.gitkeep diff --git a/code/cls/classify_kfold.py b/code/cls/classify_kfold.py index e6e94f7..2e1cce1 100644 --- a/code/cls/classify_kfold.py +++ b/code/cls/classify_kfold.py @@ -1,207 +1,6 @@ -import pandas as pd -from sklearn.naive_bayes import GaussianNB -from sklearn.preprocessing import StandardScaler -import numpy as np -from sklearn.linear_model import LogisticRegression -import os -from utils.pre4data import lasso_dimension_reduction, if_same -import xgboost as xgb -from sklearn.svm import SVC -from lightgbm import LGBMClassifier -from catboost import CatBoostClassifier -from sklearn.model_selection import train_test_split +"""5-fold cross-validation without NSE (profile:
kfold).""" -from utils.util import get_next_result_folder, save_results, calculate_metrics - - -def main(): - data_train_path = './0721/0728data_delete.xlsx' - ct_mode = data_train_path.split('data')[0].split('/')[-1] - data_train = pd.read_excel(data_train_path) - - # CPC 1-2 --> label 0, CPC3-5 --> label 1 这里你可以参照classify_lab1.py设置不同的label - data_train['label'] = pd.cut(data_train['CPC'], bins=[0, 2, 5], labels=[1, 0]) - label0_num = len(data_train[data_train['label'] == 0]) - label1_num = len(data_train[data_train['label'] == 1]) - data_train = data_train.drop(['CPC'], axis=1) - print(f"训练集形状: {data_train.shape}, 训练集时间: {ct_mode}") - result_folder = get_next_result_folder(base_path='./results_0728_delete') - - y_index_s = data_train.iloc[:, -1] - scale_pos_weight = len(y_index_s[y_index_s == 0]) / len(y_index_s[y_index_s == 1]) - scaler = StandardScaler() - print(f"数据划分完毕") - - # 初始化分类器 - ratio0 = len([x for x in data_train['label'].tolist() if x == 0]) / len(data_train['label'].tolist()) - ratio1 = len([x for x in data_train['label'].tolist() if x == 1]) / len(data_train['label'].tolist()) - clf = GaussianNB(priors=[ratio0, ratio1]) # priors=[0.5, 0.5] - clf = LogisticRegression() - - lgbm_params = { - 'objective': 'binary', # 二分类任务 - 'metric': 'binary_logloss', # 使用logloss作为评价指标 - 'learning_rate': 0.016, - 'max_depth': 6, - 'n_estimators': 500, - 'subsample': 0.7, # 构建每棵树时使用的样本比例 - 'colsample_bytree': 0.7, # 每棵树使用的特征比例 - 'scale_pos_weight': scale_pos_weight, # 根据数据不平衡调整正负样本的权重 - 'random_state': 42, - } - clf = LGBMClassifier(**lgbm_params) - - catboost_params = { - 'iterations': 500, # 迭代次数 - 'depth': 6, # 树的深度 - 'learning_rate': 0.01, # 学习率 - 'loss_function': 'Logloss', # 损失函数 - 'eval_metric': 'AUC', # AUC作为评价指标 - 'scale_pos_weight': scale_pos_weight, # 样本不平衡的调整 - 'random_seed': 42, - 'verbose': 0 # 不输出训练过程 - } - clf = CatBoostClassifier(**catboost_params) - - xgb_params = { - 'objective': 'binary:logistic', 'eval_metric': ['logloss'], - 'learning_rate': 
0.016, - 'max_depth': 6, - 'n_estimators': 600, - 'subsample': 0.72, # 用于构建每棵树的样本比例 - 'colsample_bytree': 0.705, # 控制每棵树在构建时使用的特征比例 - # 'scale_pos_weight': 1.4, # 根据实际正负样本比例设置权重 len(y[y==0]) / len(y[y==1]), - 'gamma': 0.1, - 'min_child_weight': 1, # 降低以增加模型灵活性 - 'scale_pos_weight': scale_pos_weight, - 'random_state': 42, - # 'tree_method': 'hist', - # 'device': 'cuda', - } - clf = xgb.XGBClassifier(**xgb_params) - - clf = SVC( - kernel='rbf', # 使用RBF核函数 - C=10, # 正则化参数 - gamma='auto', # scale - probability=True, # 启用概率估计 - class_weight='balanced', # 处理类别不平衡 - random_state=42) - - mode = clf.__class__.__name__ - parameter_clf = clf.get_params() - print(f"classifier is: {mode}") - acc_scores = [] - recall_scores = [] - specificity_scores = [] - precision_scores = [] - npv_scores = [] - auc_scores = [] - selected_features_all = [] - save_path = './results/roc_curve_{}_time_{}'.format(mode, ct_mode) - os.makedirs(save_path, exist_ok=True) - - random_states = [3, 13, 42, 87, 1307] - results = "classifier mode is: {}\n\nrandom_states: {}\n".format(mode, random_states) - for fold in range(1, 6): - print(f"==============第 {fold} 次实验==============") - X = data_train.drop('label', axis=1) - y = data_train['label'] - X_train, X_val, y_train, y_val = train_test_split( - X, y, - test_size=0.2, # 20% 验证集 - stratify=y, - shuffle=True, - random_state=random_states[fold-1] # 用于复现以及设置不同的实验 - ) - train_index = X_train.index - lasso_train = data_train.iloc[train_index] - results += "\ntrain index: \n{}\ntest index: \n{}\n".format(X_train.index.tolist(), X_val.index.tolist()) - results += f"train: label 0 num: {y_train.values.tolist().count(0)}, label 1 num: {y_train.values.tolist().count(1)}\n" - results += f"test : label 0 num: {y_val.values.tolist().count(0)}, label 1 num: {y_val.values.tolist().count(1)}\n\n\n" - - # 特征降维, 需要先根据原始CT特征进行降维, 然后再把其他需要增加的特征与降维后的特征进行拼接(需要通过CTid对齐人名, 可通过pd.merge实现) - print(f"data for {fold} time, train shape is : {X_train.shape}") - data_train_lasso, 
selected_features, best_alphas = lasso_dimension_reduction(lasso_train) - X_train = data_train_lasso.iloc[:, :-1] - y_train = data_train_lasso.iloc[:, -1] - print(f"data train shape is : {X_train.shape}") - - print(f"data for {fold} time, selected features num is : {len(selected_features)}") - X_val = X_val[selected_features] - - t = if_same(X_train, X_val) - if t: - print(f"X_train and X_val is same") - else: - print(f"X_train is not same as X_val") - print(f"for {fold} time, X_test.shape: {X_val.shape}, X_train.shape: {X_train.shape}") - - # 标准化 - X_train_scaled = scaler.fit_transform(X_train) - X_val_scaled = scaler.transform(X_val) - - sample_weights = np.ones(len(y_train)) - sample_weights[y_train == 0] = len(y_train) / (2 * (y_train == 0).sum()) - sample_weights[y_train == 1] = len(y_train) / (2 * (y_train == 1).sum()) - - print(f"训练开始") - clf.fit(X_train_scaled, y_train, sample_weight=sample_weights) #贝叶斯分类器 , logic分类器 - # clf.fit(X_train_scaled, y_train) #SVM, xgb, lgbm, catboost分类器 - print(f"训练完成") - - y_pred = clf.predict(X_val_scaled) - y_prob = clf.predict_proba(X_val_scaled)[:, -1] - - ACC, Recall, Specificity, Precision, NPV, roc_auc = calculate_metrics(y_val, y_pred, y_prob, save_roc_path=save_path, mode=mode) - - acc_scores.append(ACC) - recall_scores.append(Recall) - specificity_scores.append(Specificity) - precision_scores.append(Precision) - npv_scores.append(NPV) - auc_scores.append(roc_auc) - selected_features_all.append(f"次数: {fold}, number: {len(selected_features)}, features: {selected_features}\n") - - print(f"第 {fold} 次实验ACC:{ACC:.3f}") - print(f"第 {fold} 次实验Recall:{Recall:.3f}") - print(f"第 {fold} 次实验Specificity:{Specificity:.3f}") - print(f"第 {fold} 次实验Precision:{Precision:.3f}") - print(f"第 {fold} 次实验NPV:{NPV:.3f}") - print(f"第 {fold} 次实验AUC:{roc_auc:.3f}") - - final_ACC = np.mean(acc_scores) - final_Recall = np.mean(recall_scores) - final_Specificity = np.mean(specificity_scores) - final_Precision = np.mean(precision_scores) - final_NPV 
= np.mean(npv_scores) - final_AUC = np.mean(auc_scores) - print("\n最终测试集的具体指标值:") - print(f"准确率 (ACC): {final_ACC:.3f} ± {np.std(acc_scores):.3f}") - print(f"召回率 (Recall): {final_Recall:.3f} ± {np.std(recall_scores):.3f}") - print(f"特异性 (Specificity): {final_Specificity:.3f} ± {np.std(specificity_scores):.3f}") - print(f"精确率 (PPV): {final_Precision:.3f} ± {np.std(precision_scores):.3f}") - print(f"阴性预测值 (NPV): {final_NPV:.3f} ± {np.std(npv_scores):.3f}") - print(f"AUC值: {final_AUC:.3f} ± {np.std(auc_scores):.3f}") - - final_results = { - 'Recall': f"{final_Recall:.3f} ± {np.std(recall_scores):.3f}", - 'Specificity': f"{final_Specificity:.3f} ± {np.std(specificity_scores):.3f}", - 'ACC': f"{final_ACC:.3f} ± {np.std(acc_scores):.3f}", - 'PPV': f"{final_Precision:.3f} ± {np.std(precision_scores):.3f}", - 'NPV': f"{final_NPV:.3f} ± {np.std(npv_scores):.3f}", - 'AUC': f"{final_AUC:.3f} ± {np.std(auc_scores):.3f}\n", } - - results += "all dataset label 0 num: {}, label 1 num: {}\n".format(label0_num, label1_num) - print(f"all dataset label 0 num: {label0_num}, label 1 num: {label1_num}\n") - for metric, value in final_results.items(): - results += f"\n{metric}: {value}\n" - for feature in selected_features_all: - results += f"{feature}\n" - results += "\n\n" - for pm in parameter_clf: - results += "\nparameter: {} \n{} values is: {}\n".format(pm, pm, parameter_clf[pm]) - save_results(results, result_folder) +from config.cli import main if __name__ == "__main__": - main() \ No newline at end of file + main(default_profile="kfold") diff --git a/code/cls/classify_kfold_nse.py b/code/cls/classify_kfold_nse.py new file mode 100644 index 0000000..fc1cf4e --- /dev/null +++ b/code/cls/classify_kfold_nse.py @@ -0,0 +1,6 @@ +"""5-fold cross-validation with NSE features (profile: kfold_nse).""" + +from config.cli import main + +if __name__ == "__main__": + main(default_profile="kfold_nse") diff --git a/code/cls/classify_lab1.py b/code/cls/classify_lab1.py index df7534d..2e8fa65 
100644 --- a/code/cls/classify_lab1.py +++ b/code/cls/classify_lab1.py @@ -1,248 +1,6 @@ -import pandas as pd -from sklearn.naive_bayes import GaussianNB -from sklearn.preprocessing import StandardScaler -import numpy as np -from sklearn.linear_model import LogisticRegression -import os -from datetime import datetime -from utils.pre4data import lasso_dimension_reduction, if_same -import xgboost as xgb -import matplotlib.pyplot as plt -from sklearn.svm import SVC -from lightgbm import LGBMClassifier -from catboost import CatBoostClassifier +"""Lab1: fixed train/test split with optional NSE (profile: lab1).""" -from utils.util import get_next_result_folder, save_results, calculate_metrics - - -def main(): - nseif = True - - data_train_path = 'D:/thrid_beijing_hospital_data/0804lab1-train.xlsx' - data_test_path = 'D:/thrid_beijing_hospital_data/0804lab1-test.xlsx' - base_dir = 'D:/thrid_beijing_hospital_data/results_0804_lab1' - ct_mode = data_train_path.split('-')[0].split('/')[-1] - data_train = pd.read_excel(data_train_path) - data_test = pd.read_excel(data_test_path) - if nseif: - lab_describe = f'cpc1-2=0_cpc3-5=1_lab1_withnse' - train_nse = data_train[['nse极值', 'nse极值差']] - test_nse = data_test[['nse极值', 'nse极值差']] - data_train = data_train.drop(columns=['CTid', 'name', 'nse极值', 'nse极值差']) - data_test = data_test.drop(columns=['CTid', 'name', 'nse极值', 'nse极值差']) - else: - data_train = data_train.drop(columns=['CTid', 'name']) - data_test = data_test.drop(columns=['CTid', 'name']) - lab_describe = f'cpc1-2=0_cpc3-5=1_lab1_withoutnse' - - # without cpc5 -------> means dead people data - # train_df1 = train_df1[train_df1['CPC'] != 5] - # test_df1 = test_df1[test_df1['CPC'] != 5] - - # ============================================= set dataset cpc split ============================================= - # CPC 1-2 --> label 0, CPC3-5 --> label 1 - data_train['label'] = pd.cut(data_train['CPC'], bins=[0, 2, 5], labels=[0, 1]) - data_test['label'] = 
pd.cut(data_test['CPC'], bins=[0, 2, 5], labels=[0, 1]) - - # CPC 1-4 --> label 0, CPC5 --> label 1 - # train_df1['label'] = pd.cut(train_df1['CPC'], bins=[0, 4, 5], labels=[0, 1]) - # test_df1['label'] = pd.cut(test_df1['CPC'], bins=[0, 4, 5], labels=[0, 1]) - - # CPC 1-2 --> label 0, CPC3-4 --> label 1 - # train_df1['label'] = pd.cut(train_df1['CPC'], bins=[0, 2, 4], labels=[0, 1]) - # test_df1['label'] = pd.cut(test_df1['CPC'], bins=[0, 2, 4], labels=[0, 1]) - # ============================================= set dataset cpc split ============================================= - - data_train = data_train.drop(['CPC'], axis=1) - data_test = data_test.drop(['CPC'], axis=1) - - # dataset - print(f"训练集形状: {data_train.shape}, 训练集时间: {ct_mode}") - result_folder = get_next_result_folder(base_path=base_dir) - - y_index_s = data_train.iloc[:, -1] - scale_pos_weight = len(y_index_s[y_index_s == 0]) / len(y_index_s[y_index_s == 1]) - scaler = StandardScaler() - print(f"数据划分完毕") - - # 初始化分类器 - ratio0 = len([x for x in data_train['label'].tolist() if x == 0]) / len(data_train['label'].tolist()) - ratio1 = len([x for x in data_train['label'].tolist() if x == 1]) / len(data_train['label'].tolist()) - clf = GaussianNB(priors=[ratio0, ratio1]) # priors=[0.5, 0.5] - clf = LogisticRegression() - - lgbm_params = { - 'objective': 'binary', # 二分类任务 - 'metric': 'binary_logloss', # 使用logloss作为评价指标 - 'learning_rate': 0.016, - 'max_depth': 6, - 'n_estimators': 500, - 'subsample': 0.7, # 构建每棵树时使用的样本比例 - 'colsample_bytree': 0.7, # 每棵树使用的特征比例 - 'scale_pos_weight': scale_pos_weight, # 根据数据不平衡调整正负样本的权重 - 'random_state': 42, - } - clf = LGBMClassifier(**lgbm_params) - - catboost_params = { - 'iterations': 500, # 迭代次数 - 'depth': 6, # 树的深度 - 'learning_rate': 0.01, # 学习率 - 'loss_function': 'Logloss', # 损失函数 - 'eval_metric': 'AUC', # AUC作为评价指标 - 'scale_pos_weight': scale_pos_weight, # 样本不平衡的调整 - 'random_seed': 42, - 'verbose': 0 # 不输出训练过程 - } - clf = CatBoostClassifier(**catboost_params) - - 
xgb_params = { - 'objective': 'binary:logistic', 'eval_metric': ['logloss'], - 'learning_rate': 0.016, - 'max_depth': 6, - 'n_estimators': 600, - 'subsample': 0.72, # 用于构建每棵树的样本比例 - 'colsample_bytree': 0.705, # 控制每棵树在构建时使用的特征比例 - # 'scale_pos_weight': 1.4, # 根据实际正负样本比例设置权重 len(y[y==0]) / len(y[y==1]), - 'gamma': 0.1, - 'min_child_weight': 1, # 降低以增加模型灵活性 - 'scale_pos_weight': scale_pos_weight, - 'random_state': 42, - # 'tree_method': 'hist', - # 'device': 'cuda', - } - clf = xgb.XGBClassifier(**xgb_params) - - clf = SVC( - kernel='rbf', # 使用RBF核函数 - C=10, # 正则化参数 - gamma='auto', # scale - probability=True, # 启用概率估计 - class_weight='balanced', # 处理类别不平衡 - random_state=42) - - random_var = [42, 46, 52] - - # 参数网格 - param_grid = { - 'learning_rate': [0.01, 0.05, 0.1], - 'max_depth': [5, 6, 7], - 'n_estimators': [100, 200, 500], - 'subsample': [0.8, 0.9, 1.0], - 'colsample_bytree': [0.8, 0.9, 1.0], - } - mode = clf.__class__.__name__ - parameter_clf = clf.get_params() - print(f"classifier is: {mode}") - acc_scores = [] - recall_scores = [] - specificity_scores = [] - precision_scores = [] - npv_scores = [] - auc_scores = [] - selected_features_all = [] - save_path = os.path.join(result_folder, 'roc_curve_{}_time_{}_{}_{}'.format(mode, ct_mode, lab_describe, datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))) - os.makedirs(save_path, exist_ok=True) - random_states = 1307 - - results = "lab: {}\nclassifier mode is: {}\n\nrandom_states: {}, train num: {}, test num: {}\n".format(lab_describe, mode, random_states, len(data_train), len(data_test)) - X_train = data_train.drop('label', axis=1) - y_train = data_train['label'] - X_test = data_test.drop('label', axis=1) - y_test = data_test['label'] - - # save progress - results += "\ntrain index: \n{}\ntest index: \n{}\n".format(X_train.index.tolist(), X_test.index.tolist()) - results += f"train: label 0 num: {y_train.values.tolist().count(0)} ratio: {y_train.values.tolist().count(0) / len(y_train):.4f}, label 1 num: 
{y_train.values.tolist().count(1)} ratio: {y_train.values.tolist().count(1) / len(y_train):.4f}\n" - results += f"test : label 0 num: {y_test.values.tolist().count(0)} ratio: {y_test.values.tolist().count(0) / len(y_test):.4f}, label 1 num: {y_test.values.tolist().count(1)} ratio: {y_test.values.tolist().count(1) / len(y_test):.4f}\n\n\n" - - # 特征降维 - print(f"data train shape is : {X_train.shape}") - data_train_lasso, selected_features, best_alphas = lasso_dimension_reduction(data_train) - if nseif: - print(f"..........with nse data training...") - results+=f"..........with nse data training..." - data_train_withnse = pd.concat([data_train_lasso, train_nse], axis=1) - cols = [c for c in data_train_withnse.columns if c != 'label'] + ['label'] - data_train_final = data_train_withnse[cols] - X_test = X_test[selected_features] - X_test = pd.concat([X_test, test_nse], axis=1) - - else: - print(f"..........without nse data training...") - results+=f"..........without nse data training..." - data_train_final = data_train_lasso - X_test = X_test[selected_features] - - X_train = data_train_final.iloc[:, :-1] - y_train = data_train_final.iloc[:, -1] - print(f"data train shape is : {X_train.shape}") - print(f"data selected features num is : {len(selected_features)}") - - t = if_same(X_train, X_test) - if t: - print(f"X_train and X_val is same") - else: - print(f"X_train is not same as X_val") - print(f"X_test.shape: {X_test.shape}, X_train.shape: {X_train.shape}") - - # 标准化 - X_train_scaled = scaler.fit_transform(X_train) - X_val_scaled = scaler.transform(X_test) - - sample_weights = np.ones(len(y_train)) - sample_weights[y_train == 0] = len(y_train) / (2 * (y_train == 0).sum()) - sample_weights[y_train == 1] = len(y_train) / (2 * (y_train == 1).sum()) - - print(f"训练开始") - clf.fit(X_train_scaled, y_train, sample_weight=sample_weights) #贝叶斯分类器 , logic分类器 - # clf.fit(X_train_scaled, y_train) #SVM, xgb, lgbm, catboost分类器 - print(f"训练完成") - - y_pred = clf.predict(X_val_scaled) - 
y_prob = clf.predict_proba(X_val_scaled)[:, -1] - - ACC, Recall, Specificity, Precision, NPV, roc_auc = calculate_metrics(y_test, y_pred, y_prob, save_roc_path=save_path, mode=mode) - - acc_scores.append(ACC) - recall_scores.append(Recall) - specificity_scores.append(Specificity) - precision_scores.append(Precision) - npv_scores.append(NPV) - auc_scores.append(roc_auc) - selected_features_all.append(f"number: {len(selected_features)}, features: {selected_features}\n") - - final_ACC = np.mean(acc_scores) - final_Recall = np.mean(recall_scores) - final_Specificity = np.mean(specificity_scores) - final_Precision = np.mean(precision_scores) - final_NPV = np.mean(npv_scores) - final_AUC = np.mean(auc_scores) - print("\n最终测试集的具体指标值:") - print(f"准确率 (ACC): {final_ACC:.3f} ± {np.std(acc_scores):.3f}") - print(f"召回率 (Recall): {final_Recall:.3f} ± {np.std(recall_scores):.3f}") - print(f"特异性 (Specificity): {final_Specificity:.3f} ± {np.std(specificity_scores):.3f}") - print(f"精确率 (PPV): {final_Precision:.3f} ± {np.std(precision_scores):.3f}") - print(f"阴性预测值 (NPV): {final_NPV:.3f} ± {np.std(npv_scores):.3f}") - print(f"AUC值: {final_AUC:.3f} ± {np.std(auc_scores):.3f}") - print(f"classifier is: {mode}") - - final_results = { - 'Recall': f"{final_Recall:.3f} ± {np.std(recall_scores):.3f}", - 'Specificity': f"{final_Specificity:.3f} ± {np.std(specificity_scores):.3f}", - 'ACC': f"{final_ACC:.3f} ± {np.std(acc_scores):.3f}", - 'PPV': f"{final_Precision:.3f} ± {np.std(precision_scores):.3f}", - 'NPV': f"{final_NPV:.3f} ± {np.std(npv_scores):.3f}", - 'AUC': f"{final_AUC:.3f} ± {np.std(auc_scores):.3f}\n", - } - for metric, value in final_results.items(): - results += f"\n{metric}: {value}\n" - for feature in selected_features_all: - results += f"{feature}\n" - results += "\n\n" - for pm in parameter_clf: - results += "\nparameter: {} \n{} values is: {}\n".format(pm, pm, parameter_clf[pm]) - save_results(results, result_folder) +from config.cli import main if __name__ == "__main__": 
- main() \ No newline at end of file + main(default_profile="lab1") diff --git a/code/cls/classify_single.py b/code/cls/classify_single.py index 045d5b9..59abec2 100644 --- a/code/cls/classify_single.py +++ b/code/cls/classify_single.py @@ -1,335 +1,6 @@ -import pandas as pd -from sklearn.naive_bayes import GaussianNB -from sklearn.preprocessing import StandardScaler -import numpy as np -from sklearn.metrics import roc_auc_score, confusion_matrix -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import StratifiedKFold -import os -from datetime import datetime -from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score -from utils.pre4data import lasso_dimension_reduction, if_same -import xgboost as xgb -import matplotlib.pyplot as plt -from sklearn.svm import SVC -from lightgbm import LGBMClassifier -from catboost import CatBoostClassifier -from sklearn.model_selection import StratifiedShuffleSplit -from sklearn.model_selection import train_test_split +"""Single-table stratified split (profile: single).""" -def get_next_result_folder(base_path='D:/PycharmProject/classification/results_nse'): - if not os.path.exists(base_path): - os.makedirs(base_path) - return os.path.join(base_path, 'results_1') - - # 查找现有的results_i文件夹 - existing_folders = [d for d in os.listdir(base_path) - if os.path.isdir(os.path.join(base_path, d)) - and d.startswith('results_')] - - if not existing_folders: - return os.path.join(base_path, 'results_1') - - # 获取现有文件夹的最大编号 - max_num = max([int(f.split('_')[1]) for f in existing_folders]) - - # 返回下一个编号的文件夹路径 - return os.path.join(base_path, f'results_{max_num + 1}') - -def save_results(results_text, result_folder): - """保存结果到results.txt文件""" - os.makedirs(result_folder, exist_ok=True) - - result_file = os.path.join(result_folder, 'results.txt') - - timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - results_with_timestamp = f"实验时间: {timestamp}\n\n{results_text}" - - with open(result_file, 'w', 
encoding='utf-8') as f: - f.write(results_with_timestamp) - - print(f"\n结果已保存到: {result_file}") - -def calculate_metrics(y_true, y_pred, y_prob, save_roc_path=None, mode=None, fold=None): - tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() - - ACC = (tp + tn) / (tp + tn + fp + fn) - Recall = tp / (tp + fn) if (tp + fn) != 0 else 0 - Specificity = tn / (tn + fp) if (tn + fp) != 0 else 0 - Precision = tp / (tp + fp) if (tp + fp) != 0 else 0 - NPV = tn / (tn + fn) if (tn + fn) != 0 else 0 - # ROC曲线绘制逻辑 - roc_auc = None - if y_prob is not None: - roc_auc = roc_auc_score(y_true, y_prob) - - # 内置绘制ROC曲线 - fpr, tpr, _ = roc_curve(y_true, y_prob) - plt.figure() - plt.plot(fpr, tpr, color='darkorange', lw=2, - label=f'ROC curve (AUC = {roc_auc:.2f})') - plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') - plt.xlim([0.0, 1.0]) - plt.ylim([0.0, 1.05]) - plt.xlabel('False Positive Rate') - plt.ylabel('True Positive Rate') - plt.title('Receiver Operating Characteristic (ROC)') - plt.legend(loc="lower right") - - if fold: - plt.savefig(os.path.join(save_roc_path, 'model-{}_fold-{}'.format(mode, fold)), dpi=300, bbox_inches='tight') - else: - plt.savefig(os.path.join(save_roc_path, 'model-{}'.format(mode)), dpi=300, bbox_inches='tight') - plt.close() - - - roc_auc = roc_auc_score(y_true, y_prob) if y_prob is not None else None - return ACC, Recall, Specificity, Precision, NPV, roc_auc - - -def balanced_train_test_split(X, y, test_size=0.4, random_state=None): - print("初始各类别比例 :", np.unique(y, return_counts=True)[1]/len(y)) - sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state) - for train_index, val_index in sss.split(X, y): - X_train = X.iloc[train_index] - X_val = X.iloc[val_index] - y_train = y.iloc[train_index] - y_val = y.iloc[val_index] - - # 验证各类别比例是否相同 - print("验证集各类别比例:", np.unique(y_val, return_counts=True)[1]/len(y_val)) - print("训练集各类别比例:", np.unique(y_train, return_counts=True)[1]/len(y_train)) - - return X_train, 
X_val, y_train, y_val - - -def main(): - data_train_path = './data/0804data_0.xlsx' - ct_mode = data_train_path.split('data')[0].split('/')[-1] - data_train = pd.read_excel(data_train_path) - lab_describe = 'cpc1-2=0_cpc3-5=1' - - train_df1, test_df1 = train_test_split( - data_train, - test_size=0.2, # 20% 作为测试集 - stratify=data_train['CPC'], # 分层抽样 - random_state=42) - - # without cpc5 -------> means dead people data - # train_df1 = train_df1[train_df1['CPC'] != 5] - # test_df1 = test_df1[test_df1['CPC'] != 5] - - # ============================================= set dataset cpc split ============================================= - # CPC 1-2 --> label 0, CPC3-5 --> label 1 - train_df1['label'] = pd.cut(train_df1['CPC'], bins=[0, 2, 5], labels=[0, 1]) - test_df1['label'] = pd.cut(test_df1['CPC'], bins=[0, 2, 5], labels=[0, 1]) - - # # # CPC 1-4 --> label 0, CPC5 --> label 1 - # train_df1['label'] = pd.cut(train_df1['CPC'], bins=[0, 4, 5], labels=[0, 1]) - # test_df1['label'] = pd.cut(test_df1['CPC'], bins=[0, 4, 5], labels=[0, 1]) - - # # CPC 1-2 --> label 0, CPC3-4 --> label 1 - # train_df1['label'] = pd.cut(train_df1['CPC'], bins=[0, 2, 4], labels=[0, 1]) - # test_df1['label'] = pd.cut(test_df1['CPC'], bins=[0, 2, 4], labels=[0, 1]) - # ============================================= set dataset cpc split ============================================= - - train_df1 = train_df1.drop(['CPC'], axis=1) - test_df1 = test_df1.drop(['CPC'], axis=1) - - # dataset - label1_num = len(data_train[data_train['CPC'] == 1]) - label2_num = len(data_train[data_train['CPC'] == 2]) - label3_num = len(data_train[data_train['CPC'] == 3]) - label4_num = len(data_train[data_train['CPC'] == 4]) - label5_num = len(data_train[data_train['CPC'] == 5]) - print(f"训练集形状: {train_df1.shape}, 训练集时间: {ct_mode}") - result_folder = get_next_result_folder(base_path='./results_0728_delete_new') - - y_index_s = train_df1.iloc[:, -1] - scale_pos_weight = len(y_index_s[y_index_s == 0]) / 
len(y_index_s[y_index_s == 1]) - scaler = StandardScaler() - print(f"数据划分完毕") - - # 初始化分类器 - ratio0 = len([x for x in data_train['label'].tolist() if x == 0]) / len(data_train['label'].tolist()) - ratio1 = len([x for x in data_train['label'].tolist() if x == 1]) / len(data_train['label'].tolist()) - clf = GaussianNB(priors=[ratio0, ratio1]) # priors=[0.5, 0.5] - clf = LogisticRegression() - - lgbm_params = { - 'objective': 'binary', # 二分类任务 - 'metric': 'binary_logloss', # 使用logloss作为评价指标 - 'learning_rate': 0.016, - 'max_depth': 6, - 'n_estimators': 500, - 'subsample': 0.7, # 构建每棵树时使用的样本比例 - 'colsample_bytree': 0.7, # 每棵树使用的特征比例 - 'scale_pos_weight': scale_pos_weight, # 根据数据不平衡调整正负样本的权重 - 'random_state': 42, - } - clf = LGBMClassifier(**lgbm_params) - - catboost_params = { - 'iterations': 500, # 迭代次数 - 'depth': 6, # 树的深度 - 'learning_rate': 0.01, # 学习率 - 'loss_function': 'Logloss', # 损失函数 - 'eval_metric': 'AUC', # AUC作为评价指标 - 'scale_pos_weight': scale_pos_weight, # 样本不平衡的调整 - 'random_seed': 42, - 'verbose': 0 # 不输出训练过程 - } - clf = CatBoostClassifier(**catboost_params) - - xgb_params = { - 'objective': 'binary:logistic', 'eval_metric': ['logloss'], - 'learning_rate': 0.016, - 'max_depth': 6, - 'n_estimators': 600, - 'subsample': 0.72, # 用于构建每棵树的样本比例 - 'colsample_bytree': 0.705, # 控制每棵树在构建时使用的特征比例 - # 'scale_pos_weight': 1.4, # 根据实际正负样本比例设置权重 len(y[y==0]) / len(y[y==1]), - 'gamma': 0.1, - 'min_child_weight': 1, # 降低以增加模型灵活性 - 'scale_pos_weight': scale_pos_weight, - 'random_state': 42, - # 'tree_method': 'hist', - # 'device': 'cuda', - } - clf = xgb.XGBClassifier(**xgb_params) - - clf = SVC( - kernel='rbf', # 使用RBF核函数 - C=10, # 正则化参数 - gamma='auto', # scale - probability=True, # 启用概率估计 - class_weight='balanced', # 处理类别不平衡 - random_state=42) - - # 参数网格 - param_grid = { - 'learning_rate': [0.01, 0.05, 0.1], - 'max_depth': [5, 6, 7], - 'n_estimators': [100, 200, 500], - 'subsample': [0.8, 0.9, 1.0], - 'colsample_bytree': [0.8, 0.9, 1.0], - } - mode = clf.__class__.__name__ 
- parameter_clf = clf.get_params() - print(f"classifier is: {mode}") - acc_scores = [] - recall_scores = [] - specificity_scores = [] - precision_scores = [] - npv_scores = [] - auc_scores = [] - selected_features_all = [] - save_path = './results/roc_curve_{}_time_{}_{}_{}'.format(mode, ct_mode, lab_describe, datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) - os.makedirs(save_path, exist_ok=True) - - metrics_history = { - 'ACC': [], 'Recall': [], 'Specificity': [], 'Precision': [], 'NPV': [], 'AUC': [], - } - random_states = 1307 - results = "lab: {}\nclassifier mode is: {}\n\nrandom_states: {}, train num: {}, test num: {}\n".format(lab_describe, mode, random_states, len(train_df1), len(test_df1)) - X_train = train_df1.drop('label', axis=1) - y_train = train_df1['label'] - X_test = test_df1.drop('label', axis=1) - y_test = test_df1['label'] - # X_train, X_val, y_train, y_val = train_test_split( - # X, y, - # test_size=0.4, # 40% 验证集, 76 val 113 train - # stratify=y, - # shuffle=True, - # random_state=random_states[fold-1] - # ) - # X_train, X_val, y_train, y_val = balanced_train_test_split( - # X, y, - # test_size=0.4, # 40% 验证集, 76 val 113 train - # random_state=random_states - # ) - # train_index = X_train.index - # lasso_train = data_train.iloc[train_index] - results += "\ntrain index: \n{}\ntest index: \n{}\n".format(X_train.index.tolist(), X_test.index.tolist()) - results += f"train: label 0 num: {y_train.values.tolist().count(0)} ratio: {y_train.values.tolist().count(0) / len(y_train):.4f}, label 1 num: {y_train.values.tolist().count(1)} ratio: {y_train.values.tolist().count(1) / len(y_train):.4f}\n" - results += f"test : label 0 num: {y_test.values.tolist().count(0)} ratio: {y_test.values.tolist().count(0) / len(y_test):.4f}, label 1 num: {y_test.values.tolist().count(1)} ratio: {y_test.values.tolist().count(1) / len(y_test):.4f}\n\n\n" - - # 特征降维 - print(f"data train shape is : {X_train.shape}") - data_train_lasso, selected_features, best_alphas = 
lasso_dimension_reduction(train_df1) - X_train = data_train_lasso.iloc[:, :-1] - y_train = data_train_lasso.iloc[:, -1] - print(f"data train shape is : {X_train.shape}") - - - print(f"data selected features num is : {len(selected_features)}") - X_test = X_test[selected_features] - - t = if_same(X_train, X_test) - if t: - print(f"X_train and X_val is same") - else: - print(f"X_train is not same as X_val") - print(f"X_test.shape: {X_test.shape}, X_train.shape: {X_train.shape}") - - # 标准化 - X_train_scaled = scaler.fit_transform(X_train) - X_val_scaled = scaler.transform(X_test) - - sample_weights = np.ones(len(y_train)) - sample_weights[y_train == 0] = len(y_train) / (2 * (y_train == 0).sum()) - sample_weights[y_train == 1] = len(y_train) / (2 * (y_train == 1).sum()) - - print(f"训练开始") - clf.fit(X_train_scaled, y_train, sample_weight=sample_weights) #贝叶斯分类器 , logic分类器 - # clf.fit(X_train_scaled, y_train) #SVM, xgb, lgbm, catboost分类器 - print(f"训练完成") - - y_pred = clf.predict(X_val_scaled) - y_prob = clf.predict_proba(X_val_scaled)[:, -1] - - ACC, Recall, Specificity, Precision, NPV, roc_auc = calculate_metrics(y_test, y_pred, y_prob, save_roc_path=save_path, mode=mode) - - acc_scores.append(ACC) - recall_scores.append(Recall) - specificity_scores.append(Specificity) - precision_scores.append(Precision) - npv_scores.append(NPV) - auc_scores.append(roc_auc) - selected_features_all.append(f"number: {len(selected_features)}, features: {selected_features}\n") - - final_ACC = np.mean(acc_scores) - final_Recall = np.mean(recall_scores) - final_Specificity = np.mean(specificity_scores) - final_Precision = np.mean(precision_scores) - final_NPV = np.mean(npv_scores) - final_AUC = np.mean(auc_scores) - print("\n最终测试集的具体指标值:") - print(f"准确率 (ACC): {final_ACC:.3f} ± {np.std(acc_scores):.3f}") - print(f"召回率 (Recall): {final_Recall:.3f} ± {np.std(recall_scores):.3f}") - print(f"特异性 (Specificity): {final_Specificity:.3f} ± {np.std(specificity_scores):.3f}") - print(f"精确率 (PPV): 
{final_Precision:.3f} ± {np.std(precision_scores):.3f}") - print(f"阴性预测值 (NPV): {final_NPV:.3f} ± {np.std(npv_scores):.3f}") - print(f"AUC值: {final_AUC:.3f} ± {np.std(auc_scores):.3f}") - - final_results = { - 'Recall': f"{final_Recall:.3f} ± {np.std(recall_scores):.3f}", - 'Specificity': f"{final_Specificity:.3f} ± {np.std(specificity_scores):.3f}", - 'ACC': f"{final_ACC:.3f} ± {np.std(acc_scores):.3f}", - 'PPV': f"{final_Precision:.3f} ± {np.std(precision_scores):.3f}", - 'NPV': f"{final_NPV:.3f} ± {np.std(npv_scores):.3f}", - 'AUC': f"{final_AUC:.3f} ± {np.std(auc_scores):.3f}\n", - } - # results = "classifier mode is: {}\n\ntrain index: \n{}\n\ntest index: \n{}".format(mode, X_train.index, X_val.index) - results += "all dataset label 1 num: {}, label 2 num: {}, label 3 num: {}, label 4 num: {}, label 5 num: {}\n".format(label1_num, label2_num, label3_num, label4_num, label5_num) - for metric, value in final_results.items(): - results += f"\n{metric}: {value}\n" - for feature in selected_features_all: - results += f"{feature}\n" - results += "\n\n" - for pm in parameter_clf: - results += "\nparameter: {} \n{} values is: {}\n".format(pm, pm, parameter_clf[pm]) - save_results(results, result_folder) +from config.cli import main if __name__ == "__main__": - main() \ No newline at end of file + main(default_profile="single") diff --git a/code/cls/config/__init__.py b/code/cls/config/__init__.py new file mode 100644 index 0000000..70f87f2 --- /dev/null +++ b/code/cls/config/__init__.py @@ -0,0 +1,16 @@ +from config.defaults import ( + CPC_LABEL_BINS, + ID_COLUMNS, + NSE_COLUMNS, + ExperimentConfig, +) +from config.load_config import list_profiles, load_experiment_config + +__all__ = [ + "CPC_LABEL_BINS", + "ID_COLUMNS", + "NSE_COLUMNS", + "ExperimentConfig", + "list_profiles", + "load_experiment_config", +] diff --git a/code/cls/config/cli.py b/code/cls/config/cli.py new file mode 100644 index 0000000..6154bde --- /dev/null +++ b/code/cls/config/cli.py @@ -0,0 +1,65 
@@ +"""CLI entry for running experiments from YAML profiles.""" + +from __future__ import annotations + +import argparse +import os +import sys +from pathlib import Path + +from config.load_config import list_profiles, load_experiment_config + + +def build_parser(*, default_profile: str) -> argparse.ArgumentParser: + env_default = os.environ.get("CLS_EXPERIMENT", default_profile) + parser = argparse.ArgumentParser( + description="Run CPC classification experiment from a YAML profile.", + ) + parser.add_argument( + "--profile", + "-p", + default=env_default, + help=f"Experiment profile name (default: {env_default!r}, env CLS_EXPERIMENT)", + ) + parser.add_argument( + "--config", + "-c", + type=Path, + default=None, + help="Path to experiments.yaml (default: config/experiments.yaml)", + ) + parser.add_argument( + "--list-profiles", + action="store_true", + help="List available profile names and exit", + ) + return parser + + +def main(*, default_profile: str = "single") -> None: + parser = build_parser(default_profile=default_profile) + args = parser.parse_args() + + if args.list_profiles: + for name in list_profiles(args.config): + print(name) + return + + try: + config = load_experiment_config(args.profile, args.config) + except (FileNotFoundError, KeyError) as exc: + print(exc, file=sys.stderr) + sys.exit(1) + + print(f"Profile: {args.profile}") + print(f" train: {config.train_path}") + print(f" test: {config.test_path}") + print(f" use_nse={config.use_nse}, n_folds={config.n_folds}, classifier={config.classifier}") + + from pipeline.experiment import run_experiment + + run_experiment(config) + + +if __name__ == "__main__": + main() diff --git a/code/cls/config/defaults.py b/code/cls/config/defaults.py new file mode 100644 index 0000000..b06f794 --- /dev/null +++ b/code/cls/config/defaults.py @@ -0,0 +1,64 @@ +"""Shared defaults for CPC classification experiments.""" + +from __future__ import annotations + +from dataclasses import dataclass, field, fields +from 
pathlib import Path +from typing import Literal, Sequence + +# CPC 1–2 → 0, CPC 3–5 → 1 (good vs poor outcome) +CPC_LABEL_BINS: tuple[int, int, int] = (0, 2, 5) +CPC_LABELS: tuple[int, int] = (0, 1) + +ID_COLUMNS: Sequence[str] = ("CTid", "name") +NSE_COLUMNS: Sequence[str] = ("nse极值", "nse极值差") + +ClassifierName = Literal[ + "svm", + "logistic", + "gaussian_nb", + "xgboost", + "lightgbm", + "catboost", +] + + +@dataclass +class ExperimentConfig: + """Runtime settings for a single training/evaluation run.""" + + train_path: Path | str + test_path: Path | str | None = None + results_base_dir: Path | str = "./results" + lab_describe: str = "cpc1-2=0_cpc3-5=1" + use_nse: bool = False + classifier: ClassifierName = "svm" + random_state: int = 42 + # K-fold mode: when test_path is None and n_folds > 1, split from train_path + n_folds: int = 1 + fold_random_states: Sequence[int] = field( + default_factory=lambda: (3, 13, 42, 87, 1307) + ) + test_size: float = 0.2 + exclude_cpc5: bool = False + cpc_bins: tuple[int, int, int] = CPC_LABEL_BINS + cpc_labels: tuple[int, int] = CPC_LABELS + + def resolve_paths(self) -> None: + self.train_path = Path(self.train_path).expanduser() + if self.test_path is not None: + self.test_path = Path(self.test_path).expanduser() + self.results_base_dir = Path(self.results_base_dir).expanduser() + + @classmethod + def from_mapping(cls, data: dict) -> ExperimentConfig: + """Build config from a YAML profile dict (unknown keys ignored).""" + field_names = {f.name for f in fields(cls)} + kwargs = {k: v for k, v in data.items() if k in field_names and v is not None} + if "fold_random_states" in kwargs: + kwargs["fold_random_states"] = tuple(kwargs["fold_random_states"]) + if "cpc_bins" in kwargs: + kwargs["cpc_bins"] = tuple(kwargs["cpc_bins"]) + if "cpc_labels" in kwargs: + kwargs["cpc_labels"] = tuple(kwargs["cpc_labels"]) + return cls(**kwargs) diff --git a/code/cls/config/experiments.yaml b/code/cls/config/experiments.yaml new file mode 100644 
index 0000000..f9d864a --- /dev/null +++ b/code/cls/config/experiments.yaml @@ -0,0 +1,62 @@ +# 实验配置:路径与超参外置 +# +# 路径占位符: +# {data_root}、{results_root} — 来自下方 paths(支持环境变量展开) +# ${CLS_DATA_ROOT:-./data} — 直接写环境变量亦可 +# +# 环境变量(推荐): +# CLS_DATA_ROOT 数据目录(Excel) +# CLS_RESULTS_ROOT 结果输出根目录 +# CLS_EXPERIMENT 默认 profile 名(可被 --profile 覆盖) +# +# 示例(Windows): +# set CLS_DATA_ROOT=D:\thrid_beijing_hospital_data +# set CLS_RESULTS_ROOT=D:\thrid_beijing_hospital_data + +paths: + data_root: ${CLS_DATA_ROOT:-./data} + results_root: ${CLS_RESULTS_ROOT:-./results} + +experiments: + lab1: + train_path: "{data_root}/0804lab1-train.xlsx" + test_path: "{data_root}/0804lab1-test.xlsx" + results_base_dir: "{results_root}/results_0804_lab1" + lab_describe: cpc1-2=0_cpc3-5=1_lab1 + use_nse: true + classifier: svm + n_folds: 1 + random_state: 42 + + single: + train_path: "{data_root}/0804data_0.xlsx" + test_path: null + results_base_dir: "{results_root}/results_single" + lab_describe: cpc1-2=0_cpc3-5=1 + use_nse: false + classifier: svm + n_folds: 1 + test_size: 0.2 + random_state: 42 + + kfold: + train_path: "{data_root}/0728data_delete.xlsx" + test_path: null + results_base_dir: "{results_root}/results_kfold" + lab_describe: cpc1-2=0_cpc3-5=1_kfold + use_nse: false + classifier: svm + n_folds: 5 + test_size: 0.2 + fold_random_states: [3, 13, 42, 87, 1307] + + kfold_nse: + train_path: "{data_root}/0804lab1-CTdata_withCTidname_nse.xlsx" + test_path: null + results_base_dir: "{results_root}/results_kfold_nse" + lab_describe: cpc1-2=0_cpc3-5=1_kfold_nse + use_nse: true + classifier: svm + n_folds: 5 + test_size: 0.2 + fold_random_states: [3, 13, 42, 87, 1307] diff --git a/code/cls/config/feature_columns.py b/code/cls/config/feature_columns.py new file mode 100644 index 0000000..2729000 --- /dev/null +++ b/code/cls/config/feature_columns.py @@ -0,0 +1,72 @@ +"""Columns to drop from PyRadiomics exports before modeling.""" + +RADIOMICS_DROP_COLUMNS: list[str] = [ + 
"diagnostics_Image-original_Hash", + "diagnostics_Imag e-original_Hash_1", + "diagnostics_Image-original_Hash_2", + "diagnostics_Image-original_Hash_3", + "diagnostics_Mask-original_Hash", + "diagnostics_Mask-original_Hash_1", + "diagnostics_Mask-original_Hash_2", + "diagnostics_Mask-original_Hash_3", + "diagnostics_Image-original_Spacing", + "diagnostics_Image-original_Spacing_1", + "diagnostics_Image-original_Spacing_2", + "diagnostics_Image-original_Spacing_3", + "diagnostics_Image-original_Size", + "diagnostics_Image-original_Size_1", + "diagnostics_Image-original_Size_2", + "diagnostics_Image-original_Size_3", + "diagnostics_Mask-original_Spacing", + "diagnostics_Mask-original_Spacing_1", + "diagnostics_Mask-original_Spacing_2", + "diagnostics_Mask-original_Spacing_3", + "diagnostics_Mask-original_Size", + "diagnostics_Mask-original_Size_1", + "diagnostics_Mask-original_Size_2", + "diagnostics_Mask-original_Size_3", + "diagnostics_Mask-original_BoundingBox", + "diagnostics_Mask-original_BoundingBox_1", + "diagnostics_Mask-original_BoundingBox_2", + "diagnostics_Mask-original_BoundingBox_3", + "diagnostics_Mask-original_CenterOfMassIndex", + "diagnostics_Mask-original_CenterOfMassIndex_1", + "diagnostics_Mask-original_CenterOfMassIndex_2", + "diagnostics_Mask-original_CenterOfMassIndex_3", + "diagnostics_Mask-original_CenterOfMass", + "diagnostics_Mask-original_CenterOfMass_1", + "diagnostics_Mask-original_CenterOfMass_2", + "diagnostics_Mask-original_CenterOfMass_3", + "diagnostics_Mask-original_BoundingBox.1", + "diagnostics_Mask-original_BoundingBox.1_1", + "diagnostics_Mask-original_BoundingBox.1_2", + "diagnostics_Mask-original_BoundingBox.1_3", + "CPC_1", + "CPC_2", + "CPC_3", + "CTid_1", + "CTid_2", + "CTid_3", + "name_1", + "name_2", + "name_3", + "diagnostics_Mask-corrected_Spacing", + "diagnostics_Mask-corrected_Size", + "diagnostics_Mask-corrected_BoundingBox", + "diagnostics_Mask-corrected_VoxelNum", + "diagnostics_Mask-corrected_VolumeNum", + 
"diagnostics_Mask-corrected_CenterOfMassIndex", + "diagnostics_Mask-corrected_CenterOfMass", + "diagnostics_Mask-corrected_Mean", + "diagnostics_Mask-corrected_Minimum", + "diagnostics_Mask-corrected_Maximum", + "live", + "diagnostics_Versions_PyRadiomics", + "diagnostics_Versions_Numpy", + "diagnostics_Versions_SimpleITK", + "diagnostics_Versions_PyWavelet", + "diagnostics_Versions_Python", + "diagnostics_Configuration_Settings", + "diagnostics_Configuration_EnabledImageTypes", + "diagnostics_Image-original_Dimensionality", +] diff --git a/code/cls/config/load_config.py b/code/cls/config/load_config.py new file mode 100644 index 0000000..7995274 --- /dev/null +++ b/code/cls/config/load_config.py @@ -0,0 +1,89 @@ +"""Load ExperimentConfig from YAML profiles and environment variables.""" + +from __future__ import annotations + +import os +import re +from pathlib import Path +from typing import Any + +import yaml + +from config.defaults import ExperimentConfig + +_ENV_PATTERN = re.compile( + r"\$\{([^}:]+)(?::-([^}]*))?\}" +) + + +def _expand_env_string(value: str) -> str: + """Replace ``${VAR}`` and ``${VAR:-default}`` in a string.""" + + def repl(match: re.Match[str]) -> str: + name, default = match.group(1), match.group(2) + if name in os.environ: + return os.environ[name] + if default is not None: + return default + return "" + + return _ENV_PATTERN.sub(repl, value) + + +def _resolve_string(value: str, context: dict[str, str]) -> str: + expanded = _expand_env_string(value) + return expanded.format(**context) + + +def _resolve_value(value: Any, context: dict[str, str]) -> Any: + if isinstance(value, str): + return _resolve_string(value, context) + if isinstance(value, dict): + return {k: _resolve_value(v, context) for k, v in value.items()} + if isinstance(value, list): + return [_resolve_value(v, context) for v in value] + return value + + +def _default_config_path() -> Path: + return Path(__file__).resolve().parent / "experiments.yaml" + + +def 
list_profiles(config_path: Path | None = None) -> list[str]: + path = config_path or _default_config_path() + with path.open(encoding="utf-8") as f: + data = yaml.safe_load(f) or {} + experiments = data.get("experiments", {}) + return sorted(experiments.keys()) + + +def load_experiment_config( + profile: str, + config_path: Path | str | None = None, +) -> ExperimentConfig: + """ + Load one experiment profile from YAML. + + Raises: + FileNotFoundError: config file missing + KeyError: unknown profile name + """ + path = Path(config_path) if config_path else _default_config_path() + if not path.is_file(): + raise FileNotFoundError(f"Config not found: {path}") + + with path.open(encoding="utf-8") as f: + data = yaml.safe_load(f) or {} + + experiments = data.get("experiments") or {} + if profile not in experiments: + available = ", ".join(sorted(experiments.keys())) or "(none)" + raise KeyError(f"Unknown profile {profile!r}. Available: {available}") + + raw_paths = data.get("paths") or {} + context: dict[str, str] = {} + for key, raw in raw_paths.items(): + context[key] = _resolve_string(str(raw), context) + + resolved = _resolve_value(experiments[profile], context) + return ExperimentConfig.from_mapping(resolved) diff --git a/code/cls/data/.gitkeep b/code/cls/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/code/cls/models/__init__.py b/code/cls/models/__init__.py new file mode 100644 index 0000000..a8c710f --- /dev/null +++ b/code/cls/models/__init__.py @@ -0,0 +1,3 @@ +from models.factory import build_classifier + +__all__ = ["build_classifier"] diff --git a/code/cls/models/factory.py b/code/cls/models/factory.py new file mode 100644 index 0000000..1367568 --- /dev/null +++ b/code/cls/models/factory.py @@ -0,0 +1,80 @@ +"""Classifier construction with shared hyperparameters.""" + +from __future__ import annotations + +from catboost import CatBoostClassifier +from lightgbm import LGBMClassifier +from sklearn.linear_model import LogisticRegression 
+from sklearn.naive_bayes import GaussianNB +from sklearn.svm import SVC +import xgboost as xgb + +from config.defaults import ClassifierName + + +def build_classifier( + name: ClassifierName, + *, + scale_pos_weight: float, + class_priors: tuple[float, float] | None = None, + random_state: int = 42, +): + """Return a fresh sklearn-compatible classifier instance.""" + if name == "gaussian_nb": + priors = list(class_priors) if class_priors else [0.5, 0.5] + return GaussianNB(priors=priors) + + if name == "logistic": + return LogisticRegression(random_state=random_state) + + if name == "lightgbm": + return LGBMClassifier( + objective="binary", + metric="binary_logloss", + learning_rate=0.016, + max_depth=6, + n_estimators=500, + subsample=0.7, + colsample_bytree=0.7, + scale_pos_weight=scale_pos_weight, + random_state=random_state, + ) + + if name == "catboost": + return CatBoostClassifier( + iterations=500, + depth=6, + learning_rate=0.01, + loss_function="Logloss", + eval_metric="AUC", + scale_pos_weight=scale_pos_weight, + random_seed=random_state, + verbose=0, + ) + + if name == "xgboost": + return xgb.XGBClassifier( + objective="binary:logistic", + eval_metric=["logloss"], + learning_rate=0.016, + max_depth=6, + n_estimators=600, + subsample=0.72, + colsample_bytree=0.705, + gamma=0.1, + min_child_weight=1, + scale_pos_weight=scale_pos_weight, + random_state=random_state, + ) + + if name == "svm": + return SVC( + kernel="rbf", + C=10, + gamma="auto", + probability=True, + class_weight="balanced", + random_state=random_state, + ) + + raise ValueError(f"Unknown classifier: {name!r}") diff --git a/code/cls/new_data_process.py b/code/cls/new_data_process.py index 733cc52..2565b46 100644 --- a/code/cls/new_data_process.py +++ b/code/cls/new_data_process.py @@ -1,6 +1,7 @@ import pandas as pd import os +from config.feature_columns import RADIOMICS_DROP_COLUMNS from utils.pre4data import drop_columns from sklearn.model_selection import train_test_split @@ -30,29 +31,8 
@@ # data_1 = pd.concat([data0_1, data1_1, data2_1, data3_1]) data_1 = pd.concat([data0_1, data1_1, data2_1]) -# dropdata and dropdata0 是不需要的特征, 且这些特征在进行降维时的值会导致无法读取, 字符串类型或者其他不可使用的类型 -dropdata = ['diagnostics_Image-original_Hash', 'diagnostics_Imag e-original_Hash_1', 'diagnostics_Image-original_Hash_2', 'diagnostics_Image-original_Hash_3', - 'diagnostics_Mask-original_Hash', 'diagnostics_Mask-original_Hash_1', 'diagnostics_Mask-original_Hash_2', 'diagnostics_Mask-original_Hash_3', - 'diagnostics_Image-original_Spacing', 'diagnostics_Image-original_Spacing_1', 'diagnostics_Image-original_Spacing_2','diagnostics_Image-original_Spacing_3', - 'diagnostics_Image-original_Size', 'diagnostics_Image-original_Size_1', 'diagnostics_Image-original_Size_2', 'diagnostics_Image-original_Size_3', - 'diagnostics_Mask-original_Spacing', 'diagnostics_Mask-original_Spacing_1', 'diagnostics_Mask-original_Spacing_2', 'diagnostics_Mask-original_Spacing_3', - 'diagnostics_Mask-original_Size', 'diagnostics_Mask-original_Size_1', 'diagnostics_Mask-original_Size_2', 'diagnostics_Mask-original_Size_3', - 'diagnostics_Mask-original_BoundingBox', 'diagnostics_Mask-original_BoundingBox_1', 'diagnostics_Mask-original_BoundingBox_2', 'diagnostics_Mask-original_BoundingBox_3', - 'diagnostics_Mask-original_CenterOfMassIndex', 'diagnostics_Mask-original_CenterOfMassIndex_1', 'diagnostics_Mask-original_CenterOfMassIndex_2', 'diagnostics_Mask-original_CenterOfMassIndex_3', - 'diagnostics_Mask-original_CenterOfMass', 'diagnostics_Mask-original_CenterOfMass_1', 'diagnostics_Mask-original_CenterOfMass_2', 'diagnostics_Mask-original_CenterOfMass_3', - 'diagnostics_Mask-original_BoundingBox.1', 'diagnostics_Mask-original_BoundingBox.1_1', 'diagnostics_Mask-original_BoundingBox.1_2', 'diagnostics_Mask-original_BoundingBox.1_3', - 'diagnostics_Mask-original_BoundingBox', 'diagnostics_Mask-original_BoundingBox_1', 'diagnostics_Mask-original_BoundingBox_2', 'diagnostics_Mask-original_BoundingBox_3', - 
'CPC_1', 'CPC_2', 'CPC_3', 'CTid_1', 'CTid_2', 'CTid_3', 'name_1', 'name_2', 'name_3', - 'diagnostics_Mask-corrected_Spacing', 'diagnostics_Mask-corrected_Size', 'diagnostics_Mask-corrected_BoundingBox', 'diagnostics_Mask-corrected_VoxelNum', - 'diagnostics_Mask-corrected_VolumeNum','diagnostics_Mask-corrected_CenterOfMassIndex', - 'diagnostics_Mask-corrected_CenterOfMass', 'diagnostics_Mask-corrected_Mean', 'diagnostics_Mask-corrected_Minimum', - 'diagnostics_Mask-corrected_Maximum', 'live', 'diagnostics_Versions_PyRadiomics', 'diagnostics_Versions_Numpy', 'diagnostics_Versions_SimpleITK', 'diagnostics_Versions_PyWavelet', - 'diagnostics_Versions_Python', 'diagnostics_Configuration_Settings', 'diagnostics_Configuration_EnabledImageTypes', 'diagnostics_Image-original_Dimensionality'] - -dropdata0 = ['diagnostics_Mask-corrected_Spacing', 'diagnostics_Mask-corrected_Size', 'diagnostics_Mask-corrected_BoundingBox', - 'diagnostics_Mask-corrected_VoxelNum', 'diagnostics_Mask-corrected_VolumeNum', - 'diagnostics_Mask-corrected_CenterOfMassIndex', 'diagnostics_Mask-corrected_CenterOfMass', - 'diagnostics_Mask-corrected_Mean', 'diagnostics_Mask-corrected_Minimum', 'diagnostics_Mask-corrected_Maximum', 'live'] +# 不需要的 diagnostics / 元数据列(见 config/feature_columns.py) +dropdata = RADIOMICS_DROP_COLUMNS data0_1 = drop_columns(data0_1, dropdata) data1_1 = drop_columns(data1_1, dropdata) diff --git a/code/cls/pipeline/__init__.py b/code/cls/pipeline/__init__.py new file mode 100644 index 0000000..027129f --- /dev/null +++ b/code/cls/pipeline/__init__.py @@ -0,0 +1,8 @@ +from pipeline.data import load_split_tables, prepare_labeled_frames +from pipeline.experiment import run_experiment + +__all__ = [ + "load_split_tables", + "prepare_labeled_frames", + "run_experiment", +] diff --git a/code/cls/pipeline/data.py b/code/cls/pipeline/data.py new file mode 100644 index 0000000..1da118e --- /dev/null +++ b/code/cls/pipeline/data.py @@ -0,0 +1,112 @@ +"""Data loading and CPC label 
preparation.""" + +from __future__ import annotations + +from pathlib import Path + +import pandas as pd +from sklearn.model_selection import train_test_split + +from config.defaults import CPC_LABEL_BINS, CPC_LABELS, ID_COLUMNS, NSE_COLUMNS + + +def load_excel(path: Path | str) -> pd.DataFrame: + return pd.read_excel(Path(path)) + + +def assign_cpc_labels( + df: pd.DataFrame, + *, + bins: tuple[int, int, int] = CPC_LABEL_BINS, + labels: tuple[int, int] = CPC_LABELS, + exclude_cpc5: bool = False, +) -> pd.DataFrame: + """Add binary ``label`` from ``CPC`` and drop the original column.""" + if "label" in df.columns and "CPC" not in df.columns: + return df.copy() + + out = df.copy() + if exclude_cpc5: + out = out[out["CPC"] != 5] + out["label"] = pd.cut(out["CPC"], bins=list(bins), labels=list(labels)) + return out.drop(columns=["CPC"]) + + +def strip_id_columns( + df: pd.DataFrame, + *, + id_columns: tuple[str, ...] = tuple(ID_COLUMNS), + nse_columns: tuple[str, ...] = tuple(NSE_COLUMNS), + keep_nse: bool = False, +) -> tuple[pd.DataFrame, pd.DataFrame | None]: + """Split feature frame and optional NSE side table.""" + drop_cols = list(id_columns) + nse_df = None + if keep_nse: + present_nse = [c for c in nse_columns if c in df.columns] + if present_nse: + nse_df = df[present_nse].copy() + drop_cols = list(id_columns) + list(nse_columns) + features = df.drop(columns=[c for c in drop_cols if c in df.columns]) + return features, nse_df + + +def prepare_labeled_frames( + train_df: pd.DataFrame, + test_df: pd.DataFrame, + *, + use_nse: bool = False, + exclude_cpc5: bool = False, + bins: tuple[int, int, int] = CPC_LABEL_BINS, + labels: tuple[int, int] = CPC_LABELS, +) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame | None, pd.DataFrame | None]: + """Label CPC (if needed), drop metadata columns, optionally extract NSE.""" + train_labeled = assign_cpc_labels( + train_df, bins=bins, labels=labels, exclude_cpc5=exclude_cpc5 + ) + test_labeled = assign_cpc_labels( + test_df, 
bins=bins, labels=labels, exclude_cpc5=exclude_cpc5 + ) + train_x, train_nse = strip_id_columns(train_labeled, keep_nse=use_nse) + test_x, test_nse = strip_id_columns(test_labeled, keep_nse=use_nse) + return train_x, test_x, train_nse, test_nse + + +def load_split_tables( + train_path: Path | str, + test_path: Path | str | None = None, + *, + test_size: float = 0.2, + random_state: int = 42, + stratify_column: str = "CPC", +) -> tuple[pd.DataFrame, pd.DataFrame]: + """Load fixed train/test files or stratified-split a single table.""" + train_path = Path(train_path) + if test_path is not None: + return load_excel(train_path), load_excel(test_path) + + full = load_excel(train_path) + train_df, test_df = train_test_split( + full, + test_size=test_size, + stratify=full[stratify_column], + random_state=random_state, + ) + return train_df, test_df + + +def merge_nse_features( + train_features: pd.DataFrame, + test_features: pd.DataFrame, + train_nse: pd.DataFrame, + test_nse: pd.DataFrame, + selected_features: list[str], +) -> tuple[pd.DataFrame, pd.DataFrame]: + """Apply LASSO-selected CT features and append NSE columns.""" + train_body = train_features[selected_features].copy() + train_body["label"] = train_features["label"].values + test_body = test_features[selected_features].copy() + train_with_nse = pd.concat([train_body.drop(columns=["label"]), train_nse], axis=1) + train_with_nse["label"] = train_body["label"].values + test_with_nse = pd.concat([test_body, test_nse], axis=1) + return train_with_nse, test_with_nse diff --git a/code/cls/pipeline/experiment.py b/code/cls/pipeline/experiment.py new file mode 100644 index 0000000..9149e13 --- /dev/null +++ b/code/cls/pipeline/experiment.py @@ -0,0 +1,206 @@ +"""End-to-end training and evaluation for one experiment configuration.""" + +from __future__ import annotations + +import os +from datetime import datetime + +import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +from 
sklearn.preprocessing import StandardScaler + +from config.defaults import ExperimentConfig +from models.factory import build_classifier +from pipeline.data import ( + load_excel, + load_split_tables, + merge_nse_features, + prepare_labeled_frames, +) +from utils.pre4data import if_same, lasso_dimension_reduction +from utils.util import calculate_metrics, get_next_result_folder, save_results + +METRIC_KEYS = ("ACC", "Recall", "Specificity", "Precision", "NPV", "AUC") + + +def _balanced_sample_weights(y: pd.Series) -> np.ndarray: + weights = np.ones(len(y)) + for label in (0, 1): + mask = y == label + if mask.sum(): + weights[mask] = len(y) / (2 * mask.sum()) + return weights + + +def _fit_classifier(clf, X_train, y_train, sample_weights) -> None: + name = clf.__class__.__name__.lower() + if "gaussiannb" in name or "logisticregression" in name: + clf.fit(X_train, y_train, sample_weight=sample_weights) + else: + clf.fit(X_train, y_train) + + +def _run_single_fold( + train_df: pd.DataFrame, + test_df: pd.DataFrame, + config: ExperimentConfig, + *, + fold: int | None = None, + save_roc_dir: str | None = None, +) -> tuple[dict[str, float], list[str], str]: + train_x, test_x, train_nse, test_nse = prepare_labeled_frames( + train_df, + test_df, + use_nse=config.use_nse, + exclude_cpc5=config.exclude_cpc5, + bins=config.cpc_bins, + labels=config.cpc_labels, + ) + + y_test = test_x["label"] + data_lasso, selected_features, _best_alpha = lasso_dimension_reduction(train_x) + + if config.use_nse and train_nse is not None and test_nse is not None: + train_final, test_final = merge_nse_features( + train_x, test_x, train_nse, test_nse, selected_features + ) + else: + train_final = data_lasso + test_final = test_x.drop(columns=["label"])[selected_features] + + X_train = train_final.drop(columns=["label"]) + y_train = train_final["label"] + + if_same(X_train, test_final) + + scaler = StandardScaler() + X_train_scaled = scaler.fit_transform(X_train) + X_test_scaled = 
scaler.transform(test_final) + + n0, n1 = (y_train == 0).sum(), (y_train == 1).sum() + scale_pos_weight = float(n0 / max(n1, 1)) + clf = build_classifier( + config.classifier, + scale_pos_weight=scale_pos_weight, + class_priors=((y_train == 0).mean(), (y_train == 1).mean()), + random_state=config.random_state, + ) + + _fit_classifier( + clf, X_train_scaled, y_train, _balanced_sample_weights(y_train) + ) + + y_pred = clf.predict(X_test_scaled) + y_prob = clf.predict_proba(X_test_scaled)[:, -1] + mode = clf.__class__.__name__ + + acc, recall, spec, prec, npv, auc = calculate_metrics( + y_test, + y_pred, + y_prob, + save_roc_path=save_roc_dir, + mode=mode, + fold=fold, + ) + + metrics = { + "ACC": acc, + "Recall": recall, + "Specificity": spec, + "Precision": prec, + "NPV": npv, + "AUC": auc or 0.0, + } + log = ( + f"fold={fold}\n" + f"train size={len(X_train)}, test size={len(test_final)}\n" + f"selected features ({len(selected_features)}): {selected_features}\n" + ) + return metrics, selected_features, log + + +def run_experiment(config: ExperimentConfig) -> str: + """Execute experiment, write results, return results directory path.""" + config.resolve_paths() + lab = ( + f"{config.lab_describe}_withnse" + if config.use_nse + else f"{config.lab_describe}_withoutnse" + ) + + result_folder = get_next_result_folder(base_path=str(config.results_base_dir)) + mode = build_classifier(config.classifier, scale_pos_weight=1.0).__class__.__name__ + ct_mode = config.train_path.stem + save_path = os.path.join( + result_folder, + f"roc_curve_{mode}_time_{ct_mode}_{lab}_{datetime.now():%Y-%m-%d_%H-%M-%S}", + ) + os.makedirs(save_path, exist_ok=True) + + all_metrics: dict[str, list[float]] = {k: [] for k in METRIC_KEYS} + feature_logs: list[str] = [] + split_logs: list[str] = [] + + if config.n_folds > 1: + # Split raw rows (keep CPC + NSE + CTid) so each fold can run full preprocessing. 
+ full_raw = load_excel(config.train_path) + if config.exclude_cpc5: + full_raw = full_raw[full_raw["CPC"] != 5].copy() + + for fold in range(1, config.n_folds + 1): + print(f"============== Fold {fold} ==============") + rs = config.fold_random_states[fold - 1] + train_fold, test_fold = train_test_split( + full_raw, + test_size=config.test_size, + stratify=full_raw["CPC"], + shuffle=True, + random_state=rs, + ) + metrics, features, log = _run_single_fold( + train_fold, + test_fold, + config, + fold=fold, + save_roc_dir=save_path, + ) + for k in METRIC_KEYS: + all_metrics[k].append(metrics[k]) + feature_logs.append(f"fold {fold}: {features}\n") + split_logs.append(log) + else: + train_raw, test_raw = load_split_tables( + config.train_path, + config.test_path, + test_size=config.test_size, + random_state=config.random_state, + ) + metrics, features, log = _run_single_fold( + train_raw, test_raw, config, save_roc_dir=save_path + ) + for k in METRIC_KEYS: + all_metrics[k].append(metrics[k]) + feature_logs.append(str(features)) + split_logs.append(log) + + results = ( + f"lab: {lab}\nclassifier: {mode}\n" + f"random_state: {config.random_state}\nn_folds: {config.n_folds}\n\n" + ) + results += "".join(split_logs) + + for k in METRIC_KEYS: + vals = all_metrics[k] + results += f"\n{k}: {np.mean(vals):.3f} ± {np.std(vals):.3f}\n" + print(f"{k}: {np.mean(vals):.3f} ± {np.std(vals):.3f}") + + for fl in feature_logs: + results += fl + + probe = build_classifier(config.classifier, scale_pos_weight=1.0) + for pm, val in probe.get_params().items(): + results += f"\nparameter {pm}: {val}\n" + + save_results(results, result_folder) + return result_folder diff --git a/code/cls/requirements.txt b/code/cls/requirements.txt new file mode 100644 index 0000000..d122411 --- /dev/null +++ b/code/cls/requirements.txt @@ -0,0 +1,9 @@ +pandas>=2.0.0 +numpy>=1.24.0 +scikit-learn>=1.3.0 +xgboost>=2.0.0 +lightgbm>=4.0.0 +catboost>=1.2.0 +matplotlib>=3.7.0 +openpyxl>=3.1.0 +PyYAML>=6.0.0 diff 
--git a/code/cls/run_experiment.py b/code/cls/run_experiment.py new file mode 100644 index 0000000..c0ea5c5 --- /dev/null +++ b/code/cls/run_experiment.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +""" +Unified CLI for all experiment profiles. + +Examples: + python run_experiment.py --list-profiles + python run_experiment.py -p lab1 + python run_experiment.py -p kfold_nse --config config/experiments.yaml + CLS_DATA_ROOT=/path/to/data python run_experiment.py -p kfold_nse +""" + +from config.cli import main + +if __name__ == "__main__": + main(default_profile="single") diff --git a/code/cls/utils/__init__.py b/code/cls/utils/__init__.py new file mode 100644 index 0000000..8d33d98 --- /dev/null +++ b/code/cls/utils/__init__.py @@ -0,0 +1 @@ +"""Shared utilities for classification experiments.""" diff --git a/code/cls/utils/util.py b/code/cls/utils/util.py index 6763ba5..1e82581 100644 --- a/code/cls/utils/util.py +++ b/code/cls/utils/util.py @@ -36,8 +36,6 @@ def calculate_metrics(y_true, y_pred, y_prob, save_roc_path=None, mode=None, fol plt.savefig(os.path.join(save_roc_path, 'model-{}'.format(mode)), dpi=300, bbox_inches='tight') plt.close() - - roc_auc = roc_auc_score(y_true, y_prob) if y_prob is not None else None return ACC, Recall, Specificity, Precision, NPV, roc_auc