diff --git a/.gitignore b/.gitignore index 8dda3f8..655922b 100755 --- a/.gitignore +++ b/.gitignore @@ -87,3 +87,16 @@ target/ # Mypy cache .mypy_cache/ + +# Claude Code working state +.claude/ + +# Training logs +/logs/*.log + +# Model checkpoints downloaded from Colab (super_resolution.h5 in checkpoints/ +# is already tracked; this only catches root-level .h5 backups) +/best_*.h5 + +# Data archives at repo root +/*.zip diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..2e056b8 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,90 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +PCBSegClassNet is a TensorFlow-based deep learning project for PCB (Printed Circuit Board) component segmentation and classification. It uses the FICS PCB Image Collection (FPIC) dataset. + +The two tasks are handled by separate model variants sharing the same encoder: +- **Segmentation**: `PCBSegNet` — segments all 25 component classes on a full PCB image +- **Classification**: `PCBClassNet` — classifies individual cropped component images + +## Environment Setup + +```bash +conda create -n pscn python=3.8 +conda activate pscn +conda install pip +pip install -r requirements.txt +``` + +Key dependencies: `tensorflow-gpu==2.11`, `albumentations`, `pyyaml`, `tqdm`, `pandas`. + +## Commands + +All training commands must be run from the `src/` directory. 
+ +**Train segmentation** (100 epochs): +```bash +python train_segmentation.py -opt cfs/pscn_seg.yml -epoch 100 +``` + +**Evaluate segmentation** (loads best checkpoint, skips training): +```bash +python train_segmentation.py -opt cfs/pscn_seg.yml -epoch 0 +``` + +**Train classification** (100 epochs): +```bash +python train_classification.py -opt cfs/pscn_class.yml -epoch 100 +``` + +**Evaluate classification**: +```bash +python train_classification.py -opt cfs/pscn_class.yml -epoch 0 +``` + +**Data preparation** (run from `src/data/`): +```bash +# Create HSI+CLAHE images, masks, and classification crops +python create_mask.py -i ../../data/pcb_image/ -a ../../data/smd_annotation/ -id ../../data/segmentation/images -ad ../../data/segmentation/masks -cd ../../data/classification/images/ + +# Create patches (768px) and split train/test +python create_patches.py -i ../../data/segmentation/images/ -m ../../data/segmentation/masks -cd ../../data/classification/images/ -ps 768 +``` + +## Architecture + +### Encoder (shared by both tasks) +Built in `src/models/blocks.py`, the encoder has three stages: +1. **Learning Module** — three conv/depthwise-separable conv blocks with stride 2, producing feature maps at 3 scales (`learning_layer1`, `learning_layer2`, `learning_layer3`) +2. **Feature Extractor** — three `bottleneck_block` stages (MobileNetV2-style residual bottlenecks) followed by a `pyramid_pooling_block` (PSPNet-style) +3. 
**Fusion Module** — fuses the learning module output with the upsampled feature extractor output + +### Segmentation Decoder (`get_decoder` in `blocks.py`) +- Applies `tem_block` (Texture Enhancement Module: channel attention + cosine-similarity-based spatial attention) to encoder output +- Two upsampling steps with skip connections from `learning_layer2` and `learning_layer1` +- Final `Conv2D(num_classes)` + softmax + +### Classification Head (`get_classification` in `blocks.py`) +- `GlobalAveragePooling2D` on encoder output → `Dense(128, relu)` → `Dense(num_classes, softmax)` + +### Loss +Segmentation uses **DISLoss** (`src/models/loss.py`): sum of Dice loss + Jaccard loss + SSIM loss. Classification uses standard `categorical_crossentropy`. + +## Configuration + +Training hyperparameters and data paths are controlled by YAML files in `src/cfs/`: +- `pscn_seg.yml` — segmentation config (25 classes, Adam lr=1e-4, batch=16, input 512×512) +- `pscn_class.yml` — classification config (25 classes, Adam lr=1e-4, batch=16, input 512×512) + +Checkpoints are saved to `checkpoints/best_seg.h5` and `checkpoints/best_class.h5`. Logs go to `logs/app.log`. + +## Data + +25 PCB component classes: R, C, U, Q, J, L, RA, D, RN, TP, IC, P, CR, M, BTN, FB, CRA, SW, T, F, V, LED, S, QA, JP. + +The segmentation masks use specific RGB color values per class (defined in `src/data/dataloader.py::color_values`). When modifying mask generation, ensure colors match this mapping exactly. + +The FPIC dataset requires access codes from the dataset authors — it is not freely downloadable. diff --git a/notebooks/README.md b/notebooks/README.md new file mode 100644 index 0000000..f1ee2f8 --- /dev/null +++ b/notebooks/README.md @@ -0,0 +1,69 @@ +# Colab Training + +`colab_train.ipynb` is a self-contained notebook that runs the **full pipeline** end-to-end on a Colab GPU runtime: data preprocessing (mask generation + patches + train/val split) → segmentation training → classification training. 
+ +## Quickstart + +1. **Get the raw FPIC dataset** (request access codes from the dataset authors — see top-level [README.md](../README.md)). +2. **Zip raw inputs** and upload to Drive: + ```powershell + Compress-Archive -Path data\pcb_image, data\smd_annotation -DestinationPath data_raw.zip -Force + ``` + Place at `MyDrive/PCBSegClassNet/data_raw.zip` (~7 GB). +3. **Open the notebook in Colab** (replace `<your-github-username>` with the account that hosts the repo or your fork): + ``` + https://colab.research.google.com/github/<your-github-username>/PCBSegClassNet/blob/colab/notebooks/colab_train.ipynb + ``` +4. **Runtime → Change runtime type → GPU** (T4 is enough; High-RAM not needed), then run cells top to bottom. + +## What the notebook does + +| Section | Purpose | +|---|---| +| 1 | `nvidia-smi` GPU sanity | +| 2 | Clone this repo (`colab` branch) | +| 3 | Install TF 2.15 + dependencies (TF 2.15 is the last release on Keras 2; Keras 3 from TF 2.16+ breaks this codebase's `tf.keras.backend.{dot,transpose}` calls) | +| 4 | Mount Drive, unzip `data_raw.zip` to local Colab disk | +| 5 | `create_mask.py` — polygon masks + classification crops (EDSR super-resolution, GPU) | +| 6 | `create_patches.py` — 768 px patches + 80/20 train/val split (CPU) | +| 7 | Set up Drive checkpoint directory for persistence across sessions | +| 8 | Segmentation training (5 epochs sanity → 80 epochs full → mirror checkpoint to Drive) | +| 9 | Classification training (same pattern) | +| 10 | Optional: re-evaluate from Drive checkpoints in a fresh session | + +## Why preprocess on Colab? + +- Raw inputs (~7 GB) are smaller than the processed dataset (~18 GB) — easier to transfer to Drive. +- Reproducibility: anyone with raw data + this notebook can recreate the exact training set without trusting an opaque processed zip. +- Easy to iterate on preprocessing knobs (e.g. patch size) without re-uploading. + +If you already have a processed dataset zip, you can skip cells 5–6 and unzip it directly into `data/` instead. + +## Why TF 2.15? 
+ +- This repo uses `tf.keras.backend.dot` / `backend.transpose` and `tf.keras.activations.softmax(tensor)` patterns that broke in Keras 3. +- TF 2.15 is the **last TF release on Keras 2**; Keras 3 starts at TF 2.16. +- Earlier this notebook tried to pin TF 2.10 via `condacolab`, but Colab's base Python keeps moving past 3.10 and TF 2.10's wheel matrix doesn't follow. TF 2.15 ships wheels for the Python versions Colab actually serves. + +## VRAM notes + +| GPU | Comfortable batch size at 512×512 input | +|---|---| +| T4 (16 GB) | 16 | +| A100 (40 GB) | 32+ | +| L4 (24 GB) | 16-24 | +| RTX 4060 Ti (8 GB) | 4-8 (and even 8 OOMs in this codebase due to SSIM gradient) | + +The default `batch_size: 16` in `cfs/pscn_seg.yml` works on all Colab GPUs. + +## Epoch budget + +The notebook runs: +- **Sanity 5 epochs** before each full run, so you catch NaN losses or OOMs in <1 hour. +- **Full 80 epochs** for both segmentation and classification. + +80 + 80 ≈ 18 hours on an L4, fitting inside Colab Pro's 24 h session limit with margin. The original paper trained for 100 epochs; 80 leaves a safety buffer for the inevitable Drive-mount / preprocessing time at the start of a session. If you want closer to paper-faithful runs, push to 100 once you've seen one full run complete. + +## Session persistence + +Colab wipes `/content` on disconnect but Drive persists. The notebook copies the best checkpoint to Drive after each training run; section 10 shows how to restore it in a new session for evaluation. 
diff --git a/notebooks/colab_train.ipynb b/notebooks/colab_train.ipynb new file mode 100644 index 0000000..962f7e3 --- /dev/null +++ b/notebooks/colab_train.ipynb @@ -0,0 +1,351 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PCBSegClassNet — Colab Training\n", + "\n", + "End-to-end pipeline on Google Colab GPU: data preprocessing (mask generation + patches + train/val split) → segmentation training → classification training.\n", + "\n", + "**Why Colab?** Local 8 GB GPUs (e.g. RTX 4060 Ti) are too tight for `batch=16` at 512×512 input — the segmentation decoder activation alone is ~4 GB. Colab T4 (16 GB) and above handle it comfortably.\n", + "\n", + "## Before you run\n", + "1. **Runtime → Change runtime type → GPU** (T4 is enough; High-RAM not needed).\n", + "2. Have `data_raw.zip` ready in Drive at `MyDrive/PCBSegClassNet/data_raw.zip` (~7 GB; contains `pcb_image/` + `smd_annotation/`).\n", + "3. Mount Drive when prompted in §4." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. GPU sanity check" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Clone the repo\n", + "\n", + "If you forked the repo, change the URL to your fork." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd /content\n", + "!rm -rf PCBSegClassNet\n", + "!git clone -b colab https://github.com/ironmanizawesome/PCBSegClassNet.git\n", + "%cd PCBSegClassNet" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Install TF 2.15 + dependencies into Python 3.11\n", + "\n", + "TF 2.15 is the last release on Keras 2 (Keras 3 starts at TF 2.16, which breaks `tf.keras.backend.{dot,transpose}` and other patterns this codebase relies on). 
But TF 2.15 wheels only target Python 3.9–3.11, while Colab's notebook kernel runs on Python 3.12.\n", + "\n", + "Workaround: Colab images already ship a system `python3.11` binary at `/usr/local/bin/python3.11`. Install TF 2.15 + deps **into that interpreter** and run all training scripts via `!python3.11 ...`. The notebook kernel itself stays on 3.12 — that's fine, we never `import tensorflow` from it.\n", + "\n", + "> 🔑 We can't use `tensorflow[and-cuda]==2.15.0` here — that extra pins `tensorrt-libs==8.6.1`, which is no longer available on PyPI (only 9.x remains). Installing the cudnn / cublas / cuda-runtime / etc. wheels directly is enough; TensorRT is only needed for `tf.experimental.tensorrt` inference, not training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "# If python3.11 doesn't exist on this Colab image (e.g. L4 base), install it first.\n# T4 base already has /usr/local/bin/python3.11 so this is a no-op there.\n!command -v python3.11 >/dev/null || { \\\n apt-get update -q && \\\n apt-get install -y software-properties-common && \\\n add-apt-repository -y ppa:deadsnakes/ppa && \\\n apt-get update -q && \\\n apt-get install -y python3.11 python3.11-dev python3.11-distutils && \\\n curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11; \\\n}\n\n# TF + non-CUDA python deps\n!python3.11 -m pip install -q tensorflow==2.15.0 albumentations==1.4.18 opencv-python-headless pyyaml tqdm pandas scikit-learn\n\n# CUDA libs pinned to versions matching TF 2.15. 
The latest nvidia-cudnn-cu12 is\n# 9.x which TF 2.15 cannot dlopen (it links against libcudnn.so.8 specifically),\n# so version pinning is mandatory here.\n!python3.11 -m pip install -q \\\n nvidia-cudnn-cu12==8.9.4.25 \\\n nvidia-cublas-cu12==12.2.5.6 \\\n nvidia-cuda-cupti-cu12==12.2.142 \\\n nvidia-cuda-nvrtc-cu12==12.2.140 \\\n nvidia-cuda-runtime-cu12==12.2.140 \\\n nvidia-cufft-cu12==11.0.8.103 \\\n nvidia-curand-cu12==10.3.3.141 \\\n nvidia-cusolver-cu12==11.5.2.141 \\\n nvidia-cusparse-cu12==12.1.2.141 \\\n nvidia-nccl-cu12==2.16.5" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "# Verify TF + GPU under Python 3.11 (the interpreter that will actually run training).\n# `tf.keras.__version__` raises AttributeError on TF 2.15 due to a lazy_loader quirk,\n# so we deliberately don't print it here. Skip-tested: works on both T4 and L4 images.\n!python3.11 -c \"import sys, tensorflow as tf; print('Python:', sys.version.split()[0]); print('TF:', tf.__version__); print('GPU:', tf.config.list_physical_devices('GPU'))\"" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Mount Drive and unpack the raw FPIC archive\n", + "\n", + "This notebook does the **entire data prep pipeline** (mask generation + patches + train/val split) in Colab so you only need to upload the raw FPIC images + annotations (~7 GB) instead of the processed dataset (~18 GB).\n", + "\n", + "### Data layout expected on Drive\n", + "Zip the **raw** FPIC images + annotations together:\n", + "\n", + "```\n", + "/MyDrive/PCBSegClassNet/\n", + " data_raw.zip ← contains: pcb_image/*.png + smd_annotation/*.csv\n", + " checkpoints/ ← (optional, for resume / saved best models)\n", + "```\n", + "\n", + "To make the zip on a Windows host:\n", + "\n", + "```powershell\n", + "Compress-Archive -Path data\\pcb_image, data\\smd_annotation -DestinationPath data_raw.zip -Force\n", + "```\n", + "\n", + "Why unzip to local disk and not stream from Drive? Drive mounts thousands of small files extremely slowly (API throttling). Always unpack to `/content` for training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from google.colab import drive\n", + "drive.mount(\"/content/drive\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "RAW_ZIP = \"/content/drive/MyDrive/PCBSegClassNet/data_raw.zip\"\n", + "\n", + "import os, time\n", + "assert os.path.exists(RAW_ZIP), f\"Not found: {RAW_ZIP}\"\n", + "\n", + "%cd /content/PCBSegClassNet\n", + "!mkdir -p data\n", + "t0 = time.time()\n", + "!unzip -q -o {RAW_ZIP} -d data/\n", + "print(f\"Unzip done in {time.time()-t0:.1f}s\")\n", + "\n", + "!echo \"--- pcb_image:\"; ls data/pcb_image/ | wc -l\n", + "!echo \"--- smd_annotation:\"; ls data/smd_annotation/ | wc -l" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Generate masks + classification crops (`create_mask.py`)\n", + "\n", + "Runs through all annotation CSVs, fills polygon masks per component class, and writes:\n", + "- `data/segmentation/images/` — HSI + CLAHE preprocessed PCB images\n", + "- `data/segmentation/masks/` — RGB masks (color-encoded per class)\n", + "- `data/classification/images/<class>/` — individual component crops (one subfolder per component class) upscaled with the EDSR super-resolution model in `checkpoints/super_resolution.h5`\n", + "\n", + "GPU-accelerated via the EDSR forward pass. Expect ~10–30 minutes depending on Colab GPU." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd /content/PCBSegClassNet/src/data\n", + "!python3.11 create_mask.py \\\n", + " -i ../../data/pcb_image/ \\\n", + " -a ../../data/smd_annotation/ \\\n", + " -id ../../data/segmentation/images \\\n", + " -ad ../../data/segmentation/masks \\\n", + " -cd ../../data/classification/images/\n", + "\n", + "!echo \"--- segmentation/images: $(ls ../../data/segmentation/images 2>/dev/null | wc -l)\"\n", + "!echo \"--- segmentation/masks: $(ls ../../data/segmentation/masks 2>/dev/null | wc -l)\"\n", + "!echo \"--- classification crops total: $(find ../../data/classification/images -type f | wc -l)\"\n", + "!echo \"--- classification classes:\"; ls ../../data/classification/images 2>/dev/null" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Cut 768 px patches and split into train/val (`create_patches.py`)\n", + "\n", + "Cuts the full PCB images + masks into 768×768 patches and moves the patches + classification crops into `train/` and `val/` subfolders (80/20 split). 
Pure CPU work, ~5 minutes.\n", + "\n", + "After this cell, the dataset layout matches what the training scripts expect:\n", + "\n", + "```\n", + "data/segmentation/train/{images,masks}/*.png\n", + "data/segmentation/val/{images,masks}/*.png\n", + "data/classification/train/<class>/*.png\n", + "data/classification/val/<class>/*.png\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd /content/PCBSegClassNet/src/data\n", + "!python3.11 create_patches.py \\\n", + " -i ../../data/segmentation/images/ \\\n", + " -m ../../data/segmentation/masks \\\n", + " -cd ../../data/classification/images/ \\\n", + " -ps 768\n", + "\n", + "!echo \"--- seg train: $(ls ../../data/segmentation/train/images 2>/dev/null | wc -l) images / $(ls ../../data/segmentation/train/masks 2>/dev/null | wc -l) masks\"\n", + "!echo \"--- seg val: $(ls ../../data/segmentation/val/images 2>/dev/null | wc -l) images / $(ls ../../data/segmentation/val/masks 2>/dev/null | wc -l) masks\"\n", + "!echo \"--- class train: $(find ../../data/classification/train -type f 2>/dev/null | wc -l) crops\"\n", + "!echo \"--- class val: $(find ../../data/classification/val -type f 2>/dev/null | wc -l) crops\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. (Optional) Mirror checkpoints to Drive for persistence\n", + "\n", + "Colab local disk is wiped on session end. Save best model files back to Drive at the end of training. For now, just record the path." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DRIVE_CKPT_DIR = \"/content/drive/MyDrive/PCBSegClassNet/checkpoints\"\n", + "!mkdir -p {DRIVE_CKPT_DIR}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. 
Train segmentation\n", + "\n", + "Default config in `cfs/pscn_seg.yml` is `batch_size=16`, `epochs` controlled by `-epoch`.\n", + "\n", + "**First run a 5-epoch sanity pass.** If loss is finite and val_dice_coef is improving, kick off the full 80 epochs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "# Sanity check: 5 epochs\n# TF_GPU_ALLOCATOR=cuda_malloc_async reduces fragmentation OOMs on 16 GB GPUs\n# (the SSIM gradient in DISLoss spikes a 416 MB tensor that can fail to fit\n# even on T4 16 GB without async allocator). On L4 24 GB it's belt-and-braces.\n%cd /content/PCBSegClassNet/src\n!TF_GPU_ALLOCATOR=cuda_malloc_async python3.11 train_segmentation.py -opt cfs/pscn_seg.yml -epoch 5" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "# Full training run (80 epochs)\n%cd /content/PCBSegClassNet/src\n!TF_GPU_ALLOCATOR=cuda_malloc_async python3.11 train_segmentation.py -opt cfs/pscn_seg.yml -epoch 80" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Backup the best seg checkpoint to Drive\n", + "!cp /content/PCBSegClassNet/checkpoints/best_seg.h5 {DRIVE_CKPT_DIR}/best_seg.h5\n", + "!ls -la {DRIVE_CKPT_DIR}" + ] + }, + { + "cell_type": "markdown", + "source": "### 8b. (Optional) Fine-tune segmentation from best checkpoint\n\nThe first full run leaves `lr` at the `min_lr` (1e-5) of `ReduceLROnPlateau`. 
To squeeze more out of the model, resume from `best_seg.h5` with `pscn_seg_finetune.yml` — same architecture but `lr=1e-5` start and `min_lr=1e-6` so the plateau callback can step down further.\n\nIf you're running this in a fresh session, the first cell below restores `best_seg.h5` from Drive into Colab local disk (the resume flag loads weights from there).", + "metadata": {} + }, + { + "cell_type": "code", + "source": "# Make sure best_seg.h5 exists locally (restore from Drive if fresh session)\n!mkdir -p /content/PCBSegClassNet/checkpoints\n!test -f /content/PCBSegClassNet/checkpoints/best_seg.h5 || cp {DRIVE_CKPT_DIR}/best_seg.h5 /content/PCBSegClassNet/checkpoints/\n!ls -la /content/PCBSegClassNet/checkpoints/best_seg.h5\n\n# Fine-tune: 20 epochs, lr=1e-5 → min_lr=1e-6, resume from best_seg.h5\n%cd /content/PCBSegClassNet/src\n!TF_GPU_ALLOCATOR=cuda_malloc_async python3.11 train_segmentation.py -opt cfs/pscn_seg_finetune.yml -epoch 20 -resume", + "metadata": {}, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": "# Backup fine-tuned best_seg.h5 to Drive (ModelCheckpoint overwrites in place\n# whenever val_dice_coef improves, so this captures the best of the two runs)\n!cp /content/PCBSegClassNet/checkpoints/best_seg.h5 {DRIVE_CKPT_DIR}/best_seg.h5\n!ls -la {DRIVE_CKPT_DIR}", + "metadata": {}, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9. 
Train classification" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "# Sanity check: 5 epochs\n%cd /content/PCBSegClassNet/src\n!TF_GPU_ALLOCATOR=cuda_malloc_async python3.11 train_classification.py -opt cfs/pscn_class.yml -epoch 5" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "# Full training run (80 epochs)\n%cd /content/PCBSegClassNet/src\n!TF_GPU_ALLOCATOR=cuda_malloc_async python3.11 train_classification.py -opt cfs/pscn_class.yml -epoch 80" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Backup the best classification checkpoint to Drive\n", + "!cp /content/PCBSegClassNet/checkpoints/best_class.h5 {DRIVE_CKPT_DIR}/best_class.h5\n", + "!ls -la {DRIVE_CKPT_DIR}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 10. (Optional) Evaluate without retraining\n", + "\n", + "Pass `-epoch 0` to skip training; the script will load `best_*.h5` from `checkpoints/` and run `model.evaluate(val_dataset)`. Make sure the checkpoint is in `/content/PCBSegClassNet/checkpoints/` (copy it back from Drive if you reconnected)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "# Restore checkpoints from Drive after a fresh session\n!mkdir -p /content/PCBSegClassNet/checkpoints\n!cp {DRIVE_CKPT_DIR}/best_seg.h5 /content/PCBSegClassNet/checkpoints/ 2>/dev/null || echo 'no seg ckpt'\n!cp {DRIVE_CKPT_DIR}/best_class.h5 /content/PCBSegClassNet/checkpoints/ 2>/dev/null || echo 'no class ckpt'\n\n%cd /content/PCBSegClassNet/src\n!TF_GPU_ALLOCATOR=cuda_malloc_async python3.11 train_segmentation.py -opt cfs/pscn_seg.yml -epoch 0" + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "name": "PCBSegClassNet — Colab Training", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/src/cfs/pscn_seg_finetune.yml b/src/cfs/pscn_seg_finetune.yml new file mode 100644 index 0000000..7ca6ddc --- /dev/null +++ b/src/cfs/pscn_seg_finetune.yml @@ -0,0 +1,86 @@ +# ------------------------------------------------------------------------ +# Copyright (c) 2023 CandleLabAI. All Rights Reserved. +# ------------------------------------------------------------------------ +# Fine-tune config: same model, smaller learning rate. Use with -resume so +# weights are loaded from best_seg.h5 before training continues. 
+# general settings +name: PCBSegClassNet +model_type: SegmentationModel + +datasets: + train: + name: FPIC + type: Segmentation + data_images: ../data/segmentation/train/images/ + data_masks: ../data/segmentation/train/masks/ + + img_size_h: 512 + img_size_w: 512 + + # data loader + use_shuffle: true + batch_size: 16 + + val: + name: FPIC + type: Segmentation + data_images: ../data/segmentation/val/images/ + data_masks: ../data/segmentation/val/masks/ + + img_size_h: 512 + img_size_w: 512 + + use_shuffle: false + batch_size: 1 + +# path +path: + checkpoint_network: ../checkpoints/best_seg.h5 + log_file: ../logs/app.log + +# training settings +train: + optim: + type: Adam + lr: 0.00001 # 1e-5 — start where the previous run left off + betas: [0.9, 0.9] + + callbacks: + modelcheckpoint: + type: ModelCheckpoint + monitor: val_dice_coef + mode: max + verbose: 1 + save_best_only: true + save_weights_only: false + + reducelronplateau: + type: ReduceLROnPlateau + monitor: val_loss + mode: min + verbose: 1 + factor: 0.1 + patience: 15 + min_lr: 0.000001 # 1e-6 — allow finer adjustments than the initial run + + num_classes: 25 + + # losses + loss: + type: DISLoss + + # metrics + metric: + DICE: + type: dice_coef + IoU: + type: jacard_coef + +# val settings +val: + # metrics + metric: + DICE: + type: DiceCoef + IoU: + type: IoU diff --git a/src/models/network.py b/src/models/network.py index 8a1da70..15253b5 100755 --- a/src/models/network.py +++ b/src/models/network.py @@ -52,7 +52,7 @@ def build(self): """ build encoder and final model """ - encoder = get_encoder(self.image_height, self.image_width) + encoder, _, _ = get_encoder(self.image_height, self.image_width) model = get_classification(encoder, self.num_classes) return model diff --git a/src/train_segmentation.py b/src/train_segmentation.py index d51e2bb..fbda5d3 100755 --- a/src/train_segmentation.py +++ b/src/train_segmentation.py @@ -34,10 +34,14 @@ def parse_config(): type=int, default=1, help="number of 
epochs.") + parser.add_argument("-resume", + action="store_true", + help="resume training from existing best checkpoint.") args = parser.parse_args() opt = parse(args.opt) opt["train"]["total_epochs"] = args.epoch + opt["train"]["resume"] = args.resume return opt def init_log(opt): @@ -110,6 +114,12 @@ def main(): ) ) + # resume from existing checkpoint if requested + import os + if opt["train"].get("resume") and os.path.exists(opt["path"]["checkpoint_network"]): + logger.info(f"Resuming from {opt['path']['checkpoint_network']}") + model.load_weights(opt["path"]["checkpoint_network"]) + # training model if opt["train"]["total_epochs"] > 0: logger.info(f"Training for {opt['train']['total_epochs']} epochs")