stap1e · stap1e · May 28, 2026 · May 28, 2026
diff --git a/README.md b/README.md
@@ -0,0 +1,113 @@
+# CPC 预后分类（影像组学 + NSE）
+
+基于 CT 影像组学特征与 NSE 实验室指标，对心脏骤停患者 CPC 评分进行二分类（CPC 1–2 vs 3–5），支持多种机器学习模型与 LASSO 特征筛选。
+
+## 仓库结构
+
+```
+code/cls/
+├── config/           # 实验默认配置（标签分箱、列名等）
+├── models/           # 分类器工厂（统一超参入口）
+├── pipeline/         # 数据准备 + 训练评估主流程
+├── utils/            # 工具函数（LASSO、指标、结果保存）
+│   ├── pre4data.py
+│   ├── util.py
+│   └── tools/        # 数据对齐、NSE 处理等脚本
+├── classify_lab1.py  # 固定 train/test + 可选 NSE（lab1）
+├── classify_single.py # 单文件划分 train/test
+├── classify_kfold.py # 5 折交叉验证（无 NSE）
+├── classify_kfold_nse.py # 5 折 + NSE
+├── run_experiment.py # 统一 CLI（任意 profile）
+├── config/experiments.yaml # 路径与实验配置（外置）
+├── feature_process.py # 特征表合并与清洗（数据预处理）
+├── new_data_process.py
+├── train_test.py     # 划分并导出 train/test Excel
+└── requirements.txt
+```
+
+## 环境
+
+```bash
+cd code/cls
+pip install -r requirements.txt
+```
+
+## 配置路径（YAML + 环境变量）
+
+所有实验参数集中在 `code/cls/config/experiments.yaml`。路径使用占位符，无需改 Python 代码：
+
+| 变量 | 含义 | 默认值 |
+|------|------|--------|
+| `CLS_DATA_ROOT` | Excel 数据目录 | `./data` |
+| `CLS_RESULTS_ROOT` | 结果输出根目录 | `./results` |
+| `CLS_EXPERIMENT` | 默认 profile | 由各入口脚本决定（如 `classify_kfold.py` 默认为 `kfold`） |
+
+YAML 内可使用 `{data_root}/文件名.xlsx`，或 `${CLS_DATA_ROOT:-./data}/文件名.xlsx`。
+
+**Windows 示例**
+
+```bat
+set CLS_DATA_ROOT=D:\thrid_beijing_hospital_data
+set CLS_RESULTS_ROOT=D:\thrid_beijing_hospital_data
+python classify_lab1.py
+```
+
+**Linux / macOS 示例**
+
+```bash
+export CLS_DATA_ROOT=/path/to/your/data
+export CLS_RESULTS_ROOT=/path/to/your/results
+python run_experiment.py -p kfold_nse
+```
+
+查看可用 profile：
+
+```bash
+python run_experiment.py --list-profiles
+```
+
+| Profile | 说明 |
+|---------|------|
+| `lab1` | 固定 train/test，含 NSE |
+| `single` | 单表 8:2 分层划分 |
+| `kfold` | 5 折交叉验证（仅 CT 特征） |
+| `kfold_nse` | 5 折 + NSE（每折独立 LASSO 与 NSE 拼接） |
+
+修改或新增实验：编辑 `config/experiments.yaml` 中 `experiments` 节点，或复制一份自定义 YAML 并用 `-c` 指定。
+
+## 运行实验
+
+在 `code/cls` 目录下执行：
+
+```bash
+pip install -r requirements.txt
+python classify_lab1.py          # 等同: python run_experiment.py -p lab1
+python classify_single.py
+python classify_kfold.py
+python classify_kfold_nse.py       # 5 折 + NSE
+python run_experiment.py -p lab1 -c config/experiments.yaml
+```
+
+### 可选分类器
+
+`classifier` 取值：`svm` | `logistic` | `gaussian_nb` | `xgboost` | `lightgbm` | `catboost`
+
+### 标签定义
+
+默认：CPC 1–2 → 0，CPC 3–5 → 1。可在 `ExperimentConfig` 中调整 `cpc_bins` / `cpc_labels`。
+
+## 设计说明
+
+- **配置与逻辑分离**：路径、模型、是否使用 NSE 等集中在 `ExperimentConfig`，避免在多个脚本中复制数百行训练代码。
+- **统一流水线**：`pipeline.experiment.run_experiment` 负责 LASSO、标准化、训练、指标汇总与结果写入。
+- **可扩展**：新增模型只需在 `models/factory.py` 注册；新增实验类型可复用 `pipeline.data` 与 `_run_single_fold`。
+
+## 数据预处理脚本
+
+| 脚本 | 用途 |
+|------|------|
+| `feature_process.py` | 多时间点影像组学表合并、剔除 diagnostics 列 |
+| `feature_process.py` / `new_data_process.py` | 院内数据流水线（需配置本地 Excel 路径） |
+| `utils/tools/*` | CTid–姓名–NSE 对齐等 |
+
+预处理脚本（`feature_process.py`、`new_data_process.py`、`utils/tools/*`）仍可能含历史绝对路径；建议同样改为读取环境变量 `CLS_DATA_ROOT`，或沿用 `config/experiments.yaml` 中 `paths` 段的路径约定。
diff --git a/code/cls/.gitignore b/code/cls/.gitignore
@@ -0,0 +1,9 @@
+__pycache__/
+*.py[cod]
+.pytest_cache/
+.venv/
+venv/
+results/
+results_*/
+*.xlsx
+!data/.gitkeep
diff --git a/code/cls/classify_kfold.py b/code/cls/classify_kfold.py
@@ -1,207 +1,6 @@
-import pandas as pd
-from sklearn.naive_bayes import GaussianNB
-from sklearn.preprocessing import StandardScaler
-import numpy as np
-from sklearn.linear_model import LogisticRegression
-import os
-from utils.pre4data import lasso_dimension_reduction, if_same
-import xgboost as xgb
-from sklearn.svm import SVC
-from lightgbm import LGBMClassifier
-from catboost import CatBoostClassifier
-from sklearn.model_selection import train_test_split
+"""5-fold cross-validation without NSE (profile: kfold)."""
 
-from utils.util import get_next_result_folder,  save_results, calculate_metrics
-
-
-def main():
-    data_train_path = './0721/0728data_delete.xlsx' 
-    ct_mode = data_train_path.split('data')[0].split('/')[-1]
-    data_train = pd.read_excel(data_train_path)
-
-    # CPC 1-2 --> label 0,  CPC3-5 --> label 1  这里你可以参照classify_lab1.py设置不同的label
-    data_train['label'] = pd.cut(data_train['CPC'], bins=[0, 2, 5], labels=[1, 0])
-    label0_num = len(data_train[data_train['label'] == 0])
-    label1_num = len(data_train[data_train['label'] == 1])
-    data_train = data_train.drop(['CPC'], axis=1)  
-    print(f"训练集形状: {data_train.shape}, 训练集时间: {ct_mode}")
-    result_folder = get_next_result_folder(base_path='./results_0728_delete')
-
-    y_index_s = data_train.iloc[:, -1]
-    scale_pos_weight = len(y_index_s[y_index_s == 0]) / len(y_index_s[y_index_s == 1])
-    scaler = StandardScaler()
-    print(f"数据划分完毕")
-
-   # 初始化分类器
-    ratio0 = len([x for x in data_train['label'].tolist() if x == 0]) / len(data_train['label'].tolist())
-    ratio1 = len([x for x in data_train['label'].tolist() if x == 1]) / len(data_train['label'].tolist())
-    clf = GaussianNB(priors=[ratio0, ratio1])    # priors=[0.5, 0.5]
-    clf = LogisticRegression()
-
-    lgbm_params = {
-        'objective': 'binary',       # 二分类任务
-        'metric': 'binary_logloss',  # 使用logloss作为评价指标
-        'learning_rate': 0.016,
-        'max_depth': 6,
-        'n_estimators': 500,
-        'subsample': 0.7,            # 构建每棵树时使用的样本比例
-        'colsample_bytree': 0.7,     # 每棵树使用的特征比例
-        'scale_pos_weight': scale_pos_weight,  # 根据数据不平衡调整正负样本的权重
-        'random_state': 42,
-    }
-    clf = LGBMClassifier(**lgbm_params)
-
-    catboost_params = {
-        'iterations': 500,           # 迭代次数
-        'depth': 6,                  # 树的深度
-        'learning_rate': 0.01,       # 学习率
-        'loss_function': 'Logloss',  # 损失函数
-        'eval_metric': 'AUC',        # AUC作为评价指标
-        'scale_pos_weight': scale_pos_weight,  # 样本不平衡的调整
-        'random_seed': 42,
-        'verbose': 0                 # 不输出训练过程
-    }
-    clf = CatBoostClassifier(**catboost_params)
-
-    xgb_params = {
-        'objective': 'binary:logistic', 'eval_metric': ['logloss'],
-        'learning_rate': 0.016,
-        'max_depth': 6,
-        'n_estimators': 600,
-        'subsample': 0.72,           # 用于构建每棵树的样本比例
-        'colsample_bytree': 0.705,   # 控制每棵树在构建时使用的特征比例
-        # 'scale_pos_weight': 1.4,   # 根据实际正负样本比例设置权重 len(y[y==0]) / len(y[y==1]),
-        'gamma': 0.1,
-        'min_child_weight': 1,       # 降低以增加模型灵活性
-        'scale_pos_weight': scale_pos_weight,
-        'random_state': 42,
-        # 'tree_method': 'hist',  
-        # 'device': 'cuda',
-    }
-    clf = xgb.XGBClassifier(**xgb_params)
-
-    clf = SVC(
-        kernel='rbf',              # 使用RBF核函数
-        C=10,                      # 正则化参数
-        gamma='auto',              # scale
-        probability=True,          # 启用概率估计
-        class_weight='balanced',   # 处理类别不平衡 
-        random_state=42)  
-
-    mode = clf.__class__.__name__
-    parameter_clf = clf.get_params()
-    print(f"classifier is: {mode}")
-    acc_scores = []
-    recall_scores = []
-    specificity_scores = []
-    precision_scores = []
-    npv_scores = []
-    auc_scores = []
-    selected_features_all = []
-    save_path = './results/roc_curve_{}_time_{}'.format(mode, ct_mode)
-    os.makedirs(save_path, exist_ok=True)
-
-    random_states = [3, 13, 42, 87, 1307]
-    results = "classifier mode is: {}\n\nrandom_states: {}\n".format(mode, random_states)
-    for fold in range(1, 6):
-        print(f"==============第 {fold} 次实验==============")
-        X = data_train.drop('label', axis=1)
-        y = data_train['label']
-        X_train, X_val, y_train, y_val = train_test_split(
-            X, y, 
-            test_size=0.2,                         # 20% 验证集
-            stratify=y,   
-            shuffle=True,  
-            random_state=random_states[fold-1]     # 用于复现以及设置不同的实验
-        ) 
-        train_index = X_train.index
-        lasso_train = data_train.iloc[train_index]
-        results += "\ntrain index: \n{}\ntest index: \n{}\n".format(X_train.index.tolist(), X_val.index.tolist())
-        results += f"train: label 0 num: {y_train.values.tolist().count(0)}, label 1 num: {y_train.values.tolist().count(1)}\n"
-        results += f"test : label 0 num: {y_val.values.tolist().count(0)}, label 1 num: {y_val.values.tolist().count(1)}\n\n\n"
-
-        # 特征降维, 需要先根据原始CT特征进行降维, 然后再把其他需要增加的特征与降维后的特征进行拼接(需要通过CTid对齐人名, 可通过pd.merge实现)
-        print(f"data for {fold} time, train shape is : {X_train.shape}")
-        data_train_lasso, selected_features, best_alphas = lasso_dimension_reduction(lasso_train)
-        X_train = data_train_lasso.iloc[:, :-1] 
-        y_train = data_train_lasso.iloc[:, -1]
-        print(f"data train shape is : {X_train.shape}")
-
-        print(f"data for {fold} time, selected features num is : {len(selected_features)}")
-        X_val = X_val[selected_features]
-
-        t = if_same(X_train, X_val)
-        if t:
-            print(f"X_train and X_val is same")
-        else:
-            print(f"X_train is not same as X_val")
-        print(f"for {fold} time, X_test.shape: {X_val.shape}, X_train.shape: {X_train.shape}")
-
-        # 标准化
-        X_train_scaled = scaler.fit_transform(X_train)  
-        X_val_scaled = scaler.transform(X_val)
-
-        sample_weights = np.ones(len(y_train))
-        sample_weights[y_train == 0] = len(y_train) / (2 * (y_train == 0).sum())
-        sample_weights[y_train == 1] = len(y_train) / (2 * (y_train == 1).sum())
-
-        print(f"训练开始")
-        clf.fit(X_train_scaled, y_train, sample_weight=sample_weights)         #贝叶斯分类器 , logic分类器
-        # clf.fit(X_train_scaled, y_train)   #SVM, xgb, lgbm, catboost分类器
-        print(f"训练完成")
-
-        y_pred = clf.predict(X_val_scaled)
-        y_prob = clf.predict_proba(X_val_scaled)[:, -1]
-
-        ACC, Recall, Specificity, Precision, NPV, roc_auc = calculate_metrics(y_val, y_pred, y_prob, save_roc_path=save_path, mode=mode)
-
-        acc_scores.append(ACC)
-        recall_scores.append(Recall)
-        specificity_scores.append(Specificity)
-        precision_scores.append(Precision)
-        npv_scores.append(NPV)
-        auc_scores.append(roc_auc) 
-        selected_features_all.append(f"次数: {fold}, number: {len(selected_features)}, features: {selected_features}\n")
-
-        print(f"第 {fold} 次实验ACC:{ACC:.3f}")
-        print(f"第 {fold} 次实验Recall:{Recall:.3f}")
-        print(f"第 {fold} 次实验Specificity:{Specificity:.3f}")
-        print(f"第 {fold} 次实验Precision:{Precision:.3f}")
-        print(f"第 {fold} 次实验NPV:{NPV:.3f}")
-        print(f"第 {fold} 次实验AUC:{roc_auc:.3f}")
-
-    final_ACC = np.mean(acc_scores)
-    final_Recall = np.mean(recall_scores)
-    final_Specificity = np.mean(specificity_scores)
-    final_Precision = np.mean(precision_scores)
-    final_NPV = np.mean(npv_scores)
-    final_AUC = np.mean(auc_scores)
-    print("\n最终测试集的具体指标值:")
-    print(f"准确率 (ACC): {final_ACC:.3f} ± {np.std(acc_scores):.3f}")
-    print(f"召回率 (Recall): {final_Recall:.3f} ± {np.std(recall_scores):.3f}")
-    print(f"特异性 (Specificity): {final_Specificity:.3f} ± {np.std(specificity_scores):.3f}")
-    print(f"精确率 (PPV): {final_Precision:.3f} ± {np.std(precision_scores):.3f}")
-    print(f"阴性预测值 (NPV): {final_NPV:.3f}  ± {np.std(npv_scores):.3f}")
-    print(f"AUC值: {final_AUC:.3f} ± {np.std(auc_scores):.3f}")
-
-    final_results = {
-        'Recall': f"{final_Recall:.3f} ± {np.std(recall_scores):.3f}",
-        'Specificity': f"{final_Specificity:.3f} ± {np.std(specificity_scores):.3f}",
-        'ACC': f"{final_ACC:.3f} ± {np.std(acc_scores):.3f}",
-        'PPV': f"{final_Precision:.3f} ± {np.std(precision_scores):.3f}",
-        'NPV': f"{final_NPV:.3f}  ± {np.std(npv_scores):.3f}",
-        'AUC': f"{final_AUC:.3f} ± {np.std(auc_scores):.3f}\n", }
-
-    results += "all dataset label 0 num: {}, label 1 num: {}\n".format(label0_num, label1_num)
-    print(f"all dataset label 0 num: {label0_num}, label 1 num: {label1_num}\n")
-    for metric, value in final_results.items():
-        results += f"\n{metric}: {value}\n"
-    for feature in selected_features_all:
-        results += f"{feature}\n"
-    results += "\n\n"
-    for pm in parameter_clf:
-        results += "\nparameter: {} \n{} values is: {}\n".format(pm, pm, parameter_clf[pm])
-    save_results(results, result_folder)
+from config.cli import main
 
 if __name__ == "__main__":
-    main()
+    main(default_profile="kfold")
diff --git a/code/cls/classify_kfold_nse.py b/code/cls/classify_kfold_nse.py
@@ -0,0 +1,6 @@
+"""5-fold cross-validation with NSE features (profile: kfold_nse)."""
+
+from config.cli import main
+
+if __name__ == "__main__":
+    main(default_profile="kfold_nse")