From 58c2b7310e754f1899c23192d150934ef62d018c Mon Sep 17 00:00:00 2001 From: ironmanizawesome Date: Wed, 6 May 2026 02:18:43 +0900 Subject: [PATCH 01/14] fix(models): unpack get_encoder tuple in PCBClassNet build PCBClassNet.build() was passing the (model, learning_layer1, learning_layer2) tuple straight into get_classification, which expects a single Keras Model. Unpack so the classification head receives the encoder model as intended, making the classification path actually buildable. Also adds CLAUDE.md (project guidance) and ignores .claude/ working state plus training log files. Co-Authored-By: Claude Opus 4.7 --- .gitignore | 6 +++ CLAUDE.md | 90 +++++++++++++++++++++++++++++++++++++++++++ src/models/network.py | 2 +- 3 files changed, 97 insertions(+), 1 deletion(-) create mode 100644 CLAUDE.md diff --git a/.gitignore b/.gitignore index 8dda3f8..d4519f0 100755 --- a/.gitignore +++ b/.gitignore @@ -87,3 +87,9 @@ target/ # Mypy cache .mypy_cache/ + +# Claude Code working state +.claude/ + +# Training logs +/logs/*.log diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..2e056b8 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,90 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +PCBSegClassNet is a TensorFlow-based deep learning project for PCB (Printed Circuit Board) component segmentation and classification. It uses the FICS PCB Image Collection (FPIC) dataset. + +The two tasks are handled by separate model variants sharing the same encoder: +- **Segmentation**: `PCBSegNet` — segments all 25 component classes on a full PCB image +- **Classification**: `PCBClassNet` — classifies individual cropped component images + +## Environment Setup + +```bash +conda create -n pscn python=3.8 +conda activate pscn +conda install pip +pip install -r requirements.txt +``` + +Key dependencies: `tensorflow-gpu==2.11`, `albumentations`, `pyyaml`, `tqdm`, `pandas`. 
+ +## Commands + +All training commands must be run from the `src/` directory. + +**Train segmentation** (100 epochs): +```bash +python train_segmentation.py -opt cfs/pscn_seg.yml -epoch 100 +``` + +**Evaluate segmentation** (loads best checkpoint, skips training): +```bash +python train_segmentation.py -opt cfs/pscn_seg.yml -epoch 0 +``` + +**Train classification** (100 epochs): +```bash +python train_classification.py -opt cfs/pscn_class.yml -epoch 100 +``` + +**Evaluate classification**: +```bash +python train_classification.py -opt cfs/pscn_class.yml -epoch 0 +``` + +**Data preparation** (run from `src/data/`): +```bash +# Create HSI+CLAHE images, masks, and classification crops +python create_mask.py -i ../../data/pcb_image/ -a ../../data/smd_annotation/ -id ../../data/segmentation/images -ad ../../data/segmentation/masks -cd ../../data/classification/images/ + +# Create patches (768px) and split train/test +python create_patches.py -i ../../data/segmentation/images/ -m ../../data/segmentation/masks -cd ../../data/classification/images/ -ps 768 +``` + +## Architecture + +### Encoder (shared by both tasks) +Built in `src/models/blocks.py`, the encoder has three stages: +1. **Learning Module** — three conv/depthwise-separable conv blocks with stride 2, producing feature maps at 3 scales (`learning_layer1`, `learning_layer2`, `learning_layer3`) +2. **Feature Extractor** — three `bottleneck_block` stages (MobileNetV2-style residual bottlenecks) followed by a `pyramid_pooling_block` (PSPNet-style) +3. 
**Fusion Module** — fuses the learning module output with the upsampled feature extractor output + +### Segmentation Decoder (`get_decoder` in `blocks.py`) +- Applies `tem_block` (Texture Enhancement Module: channel attention + cosine-similarity-based spatial attention) to encoder output +- Two upsampling steps with skip connections from `learning_layer2` and `learning_layer1` +- Final `Conv2D(num_classes)` + softmax + +### Classification Head (`get_classification` in `blocks.py`) +- `GlobalAveragePooling2D` on encoder output → `Dense(128, relu)` → `Dense(num_classes, softmax)` + +### Loss +Segmentation uses **DISLoss** (`src/models/loss.py`): sum of Dice loss + Jaccard loss + SSIM loss. Classification uses standard `categorical_crossentropy`. + +## Configuration + +Training hyperparameters and data paths are controlled by YAML files in `src/cfs/`: +- `pscn_seg.yml` — segmentation config (25 classes, Adam lr=1e-4, batch=16, input 512×512) +- `pscn_class.yml` — classification config (25 classes, Adam lr=1e-4, batch=16, input 512×512) + +Checkpoints are saved to `checkpoints/best_seg.h5` and `checkpoints/best_class.h5`. Logs go to `logs/app.log`. + +## Data + +25 PCB component classes: R, C, U, Q, J, L, RA, D, RN, TP, IC, P, CR, M, BTN, FB, CRA, SW, T, F, V, LED, S, QA, JP. + +The segmentation masks use specific RGB color values per class (defined in `src/data/dataloader.py::color_values`). When modifying mask generation, ensure colors match this mapping exactly. + +The FPIC dataset requires access codes from the dataset authors — it is not freely downloadable. 
diff --git a/src/models/network.py b/src/models/network.py index 8a1da70..15253b5 100755 --- a/src/models/network.py +++ b/src/models/network.py @@ -52,7 +52,7 @@ def build(self): """ build encoder and final model """ - encoder = get_encoder(self.image_height, self.image_width) + encoder, _, _ = get_encoder(self.image_height, self.image_width) model = get_classification(encoder, self.num_classes) return model From 452e8b9076c170ff2a2aee9cbeee4a3c06073f18 Mon Sep 17 00:00:00 2001 From: ironmanizawesome Date: Wed, 6 May 2026 03:31:44 +0900 Subject: [PATCH 02/14] docs(colab): add notebook + readme for Colab training MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds notebooks/colab_train.ipynb covering the full pipeline (clone, TF 2.10 pin, Drive mount, data unzip, seg + class training with checkpoint backup to Drive) so an 8 GB local GPU isn't a blocker. Pins TF 2.10.1 + keras 2.10 + protobuf 3.19.6 in the install cell — Colab's bundled TF (2.15 with Keras 3) breaks `tf.keras.activations.softmax` calls and a few other patterns this codebase relies on. notebooks/README.md captures the data zip layout, why TF 2.10, and a VRAM cheat sheet for the common Colab GPUs. Co-Authored-By: Claude Opus 4.7 --- notebooks/README.md | 47 ++++++ notebooks/colab_train.ipynb | 294 ++++++++++++++++++++++++++++++++++++ 2 files changed, 341 insertions(+) create mode 100644 notebooks/README.md create mode 100644 notebooks/colab_train.ipynb diff --git a/notebooks/README.md b/notebooks/README.md new file mode 100644 index 0000000..dd77c2f --- /dev/null +++ b/notebooks/README.md @@ -0,0 +1,47 @@ +# Colab Training + +`colab_train.ipynb` is a self-contained notebook that runs the full training pipeline (segmentation + classification) on a Colab GPU runtime. + +## Quickstart + +1. **Prepare the dataset locally** (see top-level [README.md](../README.md) for `create_mask.py` → `create_patches.py`). +2. 
**Zip the prepared `data/` directory** (segmentation/ + classification/ subfolders) and upload to Drive at `MyDrive/PCBSegClassNet/data.zip`. +3. **Open the notebook in Colab**: from GitHub the easiest path is the `Open in Colab` Chrome extension, or use the URL form: + ``` + https://colab.research.google.com/github//PCBSegClassNet/blob/colab/notebooks/colab_train.ipynb + ``` +4. **Runtime → Change runtime type → GPU**, then run cells top to bottom. + +## What the notebook does + +| Cell | Purpose | +|---|---| +| 1 | `nvidia-smi` GPU sanity | +| 2 | Clone this repo (`colab` branch) | +| 3 | Pin TF 2.10.1 + matching keras / protobuf / numpy (the codebase isn't compatible with Keras 3) | +| 4 | Mount Drive, unzip `data.zip` to local Colab disk (≪ Drive in IO speed) | +| 5 | Set up Drive checkpoint directory for persistence across sessions | +| 6 | Segmentation training (5 epochs sanity → 100 epochs full → mirror checkpoint to Drive) | +| 7 | Classification training (same pattern) | +| 8 | Optional: re-evaluate from Drive checkpoints in a fresh session | + +## Why TF 2.10 specifically? + +- This repo uses `tf.keras.activations.softmax(tensor)` and other patterns that broke in Keras 3. +- TF 2.10 was the last release with native Windows GPU; verified to work end-to-end. +- Colab's bundled TF (2.15+ with Keras 3) can produce `AttributeError`s on import without changes to the codebase. + +## VRAM notes + +| GPU | Comfortable batch size at 512×512 input | +|---|---| +| T4 (16 GB) | 16 | +| A100 (40 GB) | 32+ | +| L4 (24 GB) | 16-24 | +| RTX 4060 Ti (8 GB) | 4-8 (and even 8 OOMs in this codebase due to SSIM gradient) | + +The default `batch_size: 16` in `cfs/pscn_seg.yml` works on all Colab GPUs. + +## Session persistence + +Colab wipes `/content` on disconnect but Drive persists. The notebook copies the best checkpoint to Drive after each run; cell 8 shows how to restore it in a new session for evaluation. 
diff --git a/notebooks/colab_train.ipynb b/notebooks/colab_train.ipynb new file mode 100644 index 0000000..32ef1d0 --- /dev/null +++ b/notebooks/colab_train.ipynb @@ -0,0 +1,294 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PCBSegClassNet — Colab Training\n", + "\n", + "Train PCBSegNet (segmentation) and PCBClassNet (classification) on Google Colab GPU.\n", + "\n", + "**Why Colab?** Local 8 GB GPU (e.g. RTX 4060 Ti) is too tight for `batch=16` at 512×512 input — decoder activation alone is ~4 GB. Colab T4 (16 GB) or A100 (40 GB) handles it comfortably.\n", + "\n", + "## Before you run\n", + "1. **Runtime → Change runtime type → GPU** (T4 / A100 / L4 — whatever you have).\n", + "2. Have your dataset ready as a zip in Google Drive (see *Data layout* below).\n", + "3. Mount Drive when prompted in the relevant cell." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. GPU sanity check" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Clone the repo\n", + "\n", + "If you forked it, change the URL to your fork." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd /content\n", + "!rm -rf PCBSegClassNet\n", + "!git clone -b colab https://github.com/ironmanizawesome/PCBSegClassNet.git\n", + "%cd PCBSegClassNet" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Install dependencies\n", + "\n", + "Pin to TF 2.10 (the version this repo was authored against). Colab's bundled TF is often newer (Keras 3) which breaks `tf.keras.activations.softmax(...)` patterns and a few other APIs in this codebase." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -q \\\n", + " tensorflow==2.10.1 \\\n", + " keras==2.10.0 \\\n", + " tensorflow-estimator==2.10.0 \\\n", + " protobuf==3.19.6 \\\n", + " numpy==1.24.4\n", + "\n", + "!pip install -q albumentations==1.4.18 opencv-python-headless pyyaml tqdm pandas scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "print(\"TF:\", tf.__version__)\n", + "print(\"GPU:\", tf.config.list_physical_devices(\"GPU\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Mount Drive and unpack dataset\n", + "\n", + "### Data layout expected on Drive\n", + "Recommended: zip the prepared dataset and store it on Drive.\n", + "\n", + "```\n", + "/MyDrive/PCBSegClassNet/\n", + " data.zip ← zipped contents of data/ (segmentation/, classification/)\n", + " checkpoints/ ← (optional, for resume / saved best models)\n", + "```\n", + "\n", + "Inside `data.zip` the structure should match what `create_patches.py` produced:\n", + "```\n", + "segmentation/train/images/*.png\n", + "segmentation/train/masks/*.png\n", + "segmentation/val/images/*.png\n", + "segmentation/val/masks/*.png\n", + "classification/train//*.png\n", + "classification/val//*.png\n", + "```\n", + "\n", + "Why unzip to local disk and not stream from Drive? Drive mounts thousands of small files extremely slowly (API throttling). Always unpack to `/content` for training." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from google.colab import drive\n", + "drive.mount(\"/content/drive\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Adjust path if you stored data.zip elsewhere\n", + "DATA_ZIP = \"/content/drive/MyDrive/PCBSegClassNet/data.zip\"\n", + "\n", + "import os, time\n", + "assert os.path.exists(DATA_ZIP), f\"Not found: {DATA_ZIP}\"\n", + "\n", + "%cd /content/PCBSegClassNet\n", + "!mkdir -p data\n", + "t0 = time.time()\n", + "!unzip -q -o {DATA_ZIP} -d data/\n", + "print(f\"Unzip done in {time.time()-t0:.1f}s\")\n", + "\n", + "!echo \"--- segmentation ---\"; ls data/segmentation/ 2>/dev/null\n", + "!echo \"--- classification ---\"; ls data/classification/ 2>/dev/null" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. (Optional) Mirror checkpoints to Drive for persistence\n", + "\n", + "Colab local disk is wiped on session end. Save best model files back to Drive at the end of training (or set up a callback). For now, just record the path." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DRIVE_CKPT_DIR = \"/content/drive/MyDrive/PCBSegClassNet/checkpoints\"\n", + "!mkdir -p {DRIVE_CKPT_DIR}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Train segmentation\n", + "\n", + "Default config in `cfs/pscn_seg.yml` is `batch_size=16`, `epochs` controlled by `-epoch`.\n", + "\n", + "**First run a 5-epoch sanity pass.** If loss is finite and val_dice_coef is improving, kick off the full 100 epochs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd /content/PCBSegClassNet/src\n", + "!python train_segmentation.py -opt cfs/pscn_seg.yml -epoch 5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Full training run\n", + "%cd /content/PCBSegClassNet/src\n", + "!python train_segmentation.py -opt cfs/pscn_seg.yml -epoch 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Backup the best seg checkpoint to Drive\n", + "!cp /content/PCBSegClassNet/checkpoints/best_seg.h5 {DRIVE_CKPT_DIR}/best_seg.h5\n", + "!ls -la {DRIVE_CKPT_DIR}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Train classification" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd /content/PCBSegClassNet/src\n", + "!python train_classification.py -opt cfs/pscn_class.yml -epoch 5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd /content/PCBSegClassNet/src\n", + "!python train_classification.py -opt cfs/pscn_class.yml -epoch 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Backup the best classification checkpoint to Drive\n", + "!cp /content/PCBSegClassNet/checkpoints/best_class.h5 {DRIVE_CKPT_DIR}/best_class.h5\n", + "!ls -la {DRIVE_CKPT_DIR}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. (Optional) Evaluate without retraining\n", + "\n", + "Pass `-epoch 0` to skip training; the script will load `best_*.h5` from `checkpoints/` and run `model.evaluate(val_dataset)`. Make sure the checkpoint is in `/content/PCBSegClassNet/checkpoints/` (copy it back from Drive if you reconnected)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Restore checkpoints from Drive after a fresh session\n", + "!mkdir -p /content/PCBSegClassNet/checkpoints\n", + "!cp {DRIVE_CKPT_DIR}/best_seg.h5 /content/PCBSegClassNet/checkpoints/ 2>/dev/null || echo 'no seg ckpt'\n", + "!cp {DRIVE_CKPT_DIR}/best_class.h5 /content/PCBSegClassNet/checkpoints/ 2>/dev/null || echo 'no class ckpt'\n", + "\n", + "%cd /content/PCBSegClassNet/src\n", + "!python train_segmentation.py -opt cfs/pscn_seg.yml -epoch 0" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "name": "PCBSegClassNet — Colab Training", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 065222650080677aef378a9a8c0fffc50d1d898e Mon Sep 17 00:00:00 2001 From: ironmanizawesome Date: Wed, 6 May 2026 04:03:36 +0900 Subject: [PATCH 03/14] docs(colab): use condacolab to pin Python 3.10 + TF 2.10 stack Colab's default Python is 3.12, which has no TF 2.10 wheels available (`pip install tensorflow==2.10.1` fails with "No matching distribution"). Insert a condacolab.install() step that swaps the kernel to a Python 3.10 base, then install the verified TF 2.10 stack on top. The kernel auto-restarts after condacolab.install(); the cloned repo on /content survives the restart so subsequent cells just resume. 
Co-Authored-By: Claude Opus 4.7 --- notebooks/README.md | 2 +- notebooks/colab_train.ipynb | 32 +++++++++++--------------------- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/notebooks/README.md b/notebooks/README.md index dd77c2f..00bbe82 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -18,7 +18,7 @@ |---|---| | 1 | `nvidia-smi` GPU sanity | | 2 | Clone this repo (`colab` branch) | -| 3 | Pin TF 2.10.1 + matching keras / protobuf / numpy (the codebase isn't compatible with Keras 3) | +| 3 | Swap kernel to Python 3.10 base via `condacolab`, then pin TF 2.10.1 + matching keras / protobuf / numpy. Colab's default Python 3.12 has no TF 2.10 wheels, and this codebase isn't compatible with Keras 3 (TF 2.16+) | | 4 | Mount Drive, unzip `data.zip` to local Colab disk (≪ Drive in IO speed) | | 5 | Set up Drive checkpoint directory for persistence across sessions | | 6 | Segmentation training (5 epochs sanity → 100 epochs full → mirror checkpoint to Drive) | diff --git a/notebooks/colab_train.ipynb b/notebooks/colab_train.ipynb index 32ef1d0..f0072fa 100644 --- a/notebooks/colab_train.ipynb +++ b/notebooks/colab_train.ipynb @@ -56,38 +56,28 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 3. Install dependencies\n", - "\n", - "Pin to TF 2.10 (the version this repo was authored against). Colab's bundled TF is often newer (Keras 3) which breaks `tf.keras.activations.softmax(...)` patterns and a few other APIs in this codebase." - ] + "source": "## 3. Set up Python 3.10 + TF 2.10 stack\n\nColab's default Python is now 3.12, but TF 2.10 only ships wheels for Python 3.7–3.10. Use `condacolab` to swap the runtime to a Python 3.10 base, then pin the TF 2.10 stack on top.\n\n**Why TF 2.10?** This codebase calls `tf.keras.backend.dot` / `backend.transpose` and a couple of other APIs that broke in Keras 3 (TF 2.16+). TF 2.10 is verified to run end-to-end.\n\n> ⚠️ The next cell **restarts the kernel** automatically. 
Wait for it to reconnect, then continue with the cells after it. The cloned repo at `/content/PCBSegClassNet` survives the restart (it lives on Colab's local disk)." }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "!pip install -q \\\n", - " tensorflow==2.10.1 \\\n", - " keras==2.10.0 \\\n", - " tensorflow-estimator==2.10.0 \\\n", - " protobuf==3.19.6 \\\n", - " numpy==1.24.4\n", - "\n", - "!pip install -q albumentations==1.4.18 opencv-python-headless pyyaml tqdm pandas scikit-learn" - ] + "source": "!pip install -q condacolab\nimport condacolab\ncondacolab.install() # kernel restarts automatically — rerun cells below afterwards" }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "import tensorflow as tf\n", - "print(\"TF:\", tf.__version__)\n", - "print(\"GPU:\", tf.config.list_physical_devices(\"GPU\"))" - ] + "source": "!pip install -q \\\n tensorflow==2.10.1 \\\n keras==2.10.0 \\\n tensorflow-estimator==2.10.0 \\\n protobuf==3.19.6 \\\n numpy==1.24.4\n\n!pip install -q albumentations==1.4.18 opencv-python-headless pyyaml tqdm pandas scikit-learn" + }, + { + "cell_type": "code", + "source": "import sys, tensorflow as tf\nprint(\"Python:\", sys.version.split()[0])\nprint(\"TF:\", tf.__version__)\nprint(\"GPU:\", tf.config.list_physical_devices(\"GPU\"))", + "metadata": {}, + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -291,4 +281,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} +} \ No newline at end of file From 0c76fe8bdf1d31d93d366528805ec0eabc75cf56 Mon Sep 17 00:00:00 2001 From: ironmanizawesome Date: Sun, 10 May 2026 16:37:05 +0900 Subject: [PATCH 04/14] docs(colab): preprocess in Colab + cap full training at 40 epochs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restructures the notebook so the entire data prep pipeline runs in Colab from the raw FPIC archive (~7 GB) instead of requiring a 
pre-zipped processed dataset (~18 GB): - §4 unzips data_raw.zip (pcb_image + smd_annotation) - §5 runs create_mask.py (GPU-accelerated EDSR upscaling) - §6 runs create_patches.py (768 px patches + 80/20 train/val split) - §§7-10 unchanged training/eval flow with section numbers shifted Caps full training at 40 epochs for both segmentation and classification. Colab Pro caps a single session at 24 h with a 90-min idle limit and no background execution; Seg 100 + Class 100 (~30-37 h) cannot fit. Seg 40 + Class 40 fits comfortably in roughly 12 h on a T4. Co-Authored-By: Claude Opus 4.7 --- notebooks/README.md | 38 +++++++++----- notebooks/colab_train.ipynb | 99 +++++++++++++------------------------ 2 files changed, 60 insertions(+), 77 deletions(-) diff --git a/notebooks/README.md b/notebooks/README.md index 00bbe82..aa5b648 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -1,12 +1,16 @@ # Colab Training -`colab_train.ipynb` is a self-contained notebook that runs the full training pipeline (segmentation + classification) on a Colab GPU runtime. +`colab_train.ipynb` is a self-contained notebook that runs the **full pipeline** end-to-end on a Colab GPU runtime: data preprocessing (mask generation + patches + train/val split) → segmentation training → classification training. ## Quickstart -1. **Prepare the dataset locally** (see top-level [README.md](../README.md) for `create_mask.py` → `create_patches.py`). -2. **Zip the prepared `data/` directory** (segmentation/ + classification/ subfolders) and upload to Drive at `MyDrive/PCBSegClassNet/data.zip`. -3. **Open the notebook in Colab**: from GitHub the easiest path is the `Open in Colab` Chrome extension, or use the URL form: +1. **Get the raw FPIC dataset** (request access codes from the dataset authors — see top-level [README.md](../README.md)). +2. 
**Zip raw inputs** and upload to Drive: + ```powershell + Compress-Archive -Path data\pcb_image, data\smd_annotation -DestinationPath data_raw.zip -Force + ``` + Place at `MyDrive/PCBSegClassNet/data_raw.zip` (~7 GB). +3. **Open the notebook in Colab**: ``` https://colab.research.google.com/github//PCBSegClassNet/blob/colab/notebooks/colab_train.ipynb ``` @@ -14,20 +18,30 @@ ## What the notebook does -| Cell | Purpose | +| Section | Purpose | |---|---| | 1 | `nvidia-smi` GPU sanity | | 2 | Clone this repo (`colab` branch) | | 3 | Swap kernel to Python 3.10 base via `condacolab`, then pin TF 2.10.1 + matching keras / protobuf / numpy. Colab's default Python 3.12 has no TF 2.10 wheels, and this codebase isn't compatible with Keras 3 (TF 2.16+) | -| 4 | Mount Drive, unzip `data.zip` to local Colab disk (≪ Drive in IO speed) | -| 5 | Set up Drive checkpoint directory for persistence across sessions | -| 6 | Segmentation training (5 epochs sanity → 100 epochs full → mirror checkpoint to Drive) | -| 7 | Classification training (same pattern) | -| 8 | Optional: re-evaluate from Drive checkpoints in a fresh session | +| 4 | Mount Drive, unzip `data_raw.zip` to local Colab disk | +| 5 | `create_mask.py` — polygon masks + classification crops (EDSR super-resolution, GPU) | +| 6 | `create_patches.py` — 768 px patches + 80/20 train/val split (CPU) | +| 7 | Set up Drive checkpoint directory for persistence across sessions | +| 8 | Segmentation training (5 epochs sanity → 40 epochs full → mirror checkpoint to Drive) | +| 9 | Classification training (same pattern) | +| 10 | Optional: re-evaluate from Drive checkpoints in a fresh session | + +## Why preprocess on Colab? + +- Raw inputs (~7 GB) are smaller than the processed dataset (~18 GB) — easier to transfer to Drive. +- Reproducibility: anyone with raw data + this notebook can recreate the exact training set without trusting an opaque processed zip. +- Easy to iterate on preprocessing knobs (e.g. 
patch size) without re-uploading. + +If you already have a processed dataset zip, you can skip sections 5–6 and unzip it directly into `data/` instead. ## Why TF 2.10 specifically? -- This repo uses `tf.keras.activations.softmax(tensor)` and other patterns that broke in Keras 3. +- This repo uses `tf.keras.activations.softmax(tensor)` and `tf.keras.backend.{dot,transpose}` patterns that broke in Keras 3. - TF 2.10 was the last release with native Windows GPU; verified to work end-to-end. - Colab's bundled TF (2.15+ with Keras 3) can produce `AttributeError`s on import without changes to the codebase. @@ -44,4 +58,4 @@ The default `batch_size: 16` in `cfs/pscn_seg.yml` works on all Colab GPUs. ## Session persistence -Colab wipes `/content` on disconnect but Drive persists. The notebook copies the best checkpoint to Drive after each run; cell 8 shows how to restore it in a new session for evaluation. +Colab wipes `/content` on disconnect but Drive persists. The notebook copies the best checkpoint to Drive after each training run; section 10 shows how to restore it in a new session for evaluation. diff --git a/notebooks/colab_train.ipynb b/notebooks/colab_train.ipynb index f0072fa..1d840e6 100644 --- a/notebooks/colab_train.ipynb +++ b/notebooks/colab_train.ipynb @@ -82,30 +82,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 4. 
Mount Drive and unpack dataset\n", - "\n", - "### Data layout expected on Drive\n", - "Recommended: zip the prepared dataset and store it on Drive.\n", - "\n", - "```\n", - "/MyDrive/PCBSegClassNet/\n", - " data.zip ← zipped contents of data/ (segmentation/, classification/)\n", - " checkpoints/ ← (optional, for resume / saved best models)\n", - "```\n", - "\n", - "Inside `data.zip` the structure should match what `create_patches.py` produced:\n", - "```\n", - "segmentation/train/images/*.png\n", - "segmentation/train/masks/*.png\n", - "segmentation/val/images/*.png\n", - "segmentation/val/masks/*.png\n", - "classification/train//*.png\n", - "classification/val//*.png\n", - "```\n", - "\n", - "Why unzip to local disk and not stream from Drive? Drive mounts thousands of small files extremely slowly (API throttling). Always unpack to `/content` for training." - ] + "source": "## 4. Mount Drive and unpack the raw FPIC archive\n\nThis notebook does the **entire data prep pipeline** (mask generation + patches + train/val split) in Colab so you only need to upload the raw FPIC images + annotations (~7 GB) instead of the processed dataset (~18 GB).\n\n### Data layout expected on Drive\nZip the **raw** FPIC images + annotations together and store on Drive:\n\n```\n/MyDrive/PCBSegClassNet/\n data_raw.zip ← contains: pcb_image/*.png + smd_annotation/*.csv\n checkpoints/ ← (optional, for resume / saved best models)\n```\n\nTo make the zip on a Windows host:\n\n```powershell\nCompress-Archive -Path data\\pcb_image, data\\smd_annotation -DestinationPath data_raw.zip -Force\n```\n\nWhy unzip to local disk and not stream from Drive? Drive mounts thousands of small files extremely slowly (API throttling). Always unpack to `/content` for training." 
}, { "cell_type": "code", @@ -122,22 +99,31 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Adjust path if you stored data.zip elsewhere\n", - "DATA_ZIP = \"/content/drive/MyDrive/PCBSegClassNet/data.zip\"\n", - "\n", - "import os, time\n", - "assert os.path.exists(DATA_ZIP), f\"Not found: {DATA_ZIP}\"\n", - "\n", - "%cd /content/PCBSegClassNet\n", - "!mkdir -p data\n", - "t0 = time.time()\n", - "!unzip -q -o {DATA_ZIP} -d data/\n", - "print(f\"Unzip done in {time.time()-t0:.1f}s\")\n", - "\n", - "!echo \"--- segmentation ---\"; ls data/segmentation/ 2>/dev/null\n", - "!echo \"--- classification ---\"; ls data/classification/ 2>/dev/null" - ] + "source": "# Adjust path if you stored the raw zip elsewhere\nRAW_ZIP = \"/content/drive/MyDrive/PCBSegClassNet/data_raw.zip\"\n\nimport os, time\nassert os.path.exists(RAW_ZIP), f\"Not found: {RAW_ZIP}\"\n\n%cd /content/PCBSegClassNet\n!mkdir -p data\nt0 = time.time()\n!unzip -q -o {RAW_ZIP} -d data/\nprint(f\"Unzip done in {time.time()-t0:.1f}s\")\n\n!echo \"--- pcb_image:\"; ls data/pcb_image/ | wc -l\n!echo \"--- smd_annotation:\"; ls data/smd_annotation/ | wc -l" + }, + { + "cell_type": "markdown", + "source": "## 5. Generate masks + classification crops (`create_mask.py`)\n\nRuns through all annotation CSVs, fills polygon masks per component class, and writes:\n- `data/segmentation/images/` — HSI + CLAHE preprocessed PCB images\n- `data/segmentation/masks/` — RGB masks (color-encoded per class)\n- `data/classification/images//` — individual component crops upscaled with the EDSR super-resolution model in `checkpoints/super_resolution.h5`\n\nGPU-accelerated via the EDSR forward pass. 
Expect ~10–30 minutes depending on Colab GPU (A100 fastest).", + "metadata": {} + }, + { + "cell_type": "code", + "source": "%cd /content/PCBSegClassNet/src/data\n!python create_mask.py \\\n -i ../../data/pcb_image/ \\\n -a ../../data/smd_annotation/ \\\n -id ../../data/segmentation/images \\\n -ad ../../data/segmentation/masks \\\n -cd ../../data/classification/images/\n\n!echo \"--- segmentation/images: $(ls ../../data/segmentation/images 2>/dev/null | wc -l)\"\n!echo \"--- segmentation/masks: $(ls ../../data/segmentation/masks 2>/dev/null | wc -l)\"\n!echo \"--- classification crops total: $(find ../../data/classification/images -type f | wc -l)\"\n!echo \"--- classification classes:\"; ls ../../data/classification/images 2>/dev/null", + "metadata": {}, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": "## 6. Cut 768 px patches and split into train/val (`create_patches.py`)\n\nCuts the full PCB images + masks into 768×768 patches and moves the patches + classification crops into `train/` and `val/` subfolders (80/20 split). Pure CPU work, ~5 minutes.\n\nAfter this cell, the dataset layout matches what the training scripts expect:\n\n```\ndata/segmentation/train/{images,masks}/*.png\ndata/segmentation/val/{images,masks}/*.png\ndata/classification/train//*.png\ndata/classification/val//*.png\n```", + "metadata": {} + }, + { + "cell_type": "markdown", + "source": "## 7. (Optional) Mirror checkpoints to Drive for persistence\n\nColab local disk is wiped on session end. Save best model files back to Drive at the end of training (or set up a callback). 
For now, just record the path.", + "metadata": {}, + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -149,14 +135,11 @@ ] }, { - "cell_type": "code", + "cell_type": "markdown", "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "DRIVE_CKPT_DIR = \"/content/drive/MyDrive/PCBSegClassNet/checkpoints\"\n", - "!mkdir -p {DRIVE_CKPT_DIR}" - ] + "source": "## 8. Train segmentation\n\nDefault config in `cfs/pscn_seg.yml` is `batch_size=16`, `epochs` controlled by `-epoch`.\n\n**First run a 5-epoch sanity pass.** If loss is finite and val_dice_coef is improving, kick off the full 40 epochs." }, { "cell_type": "markdown", @@ -174,10 +157,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "%cd /content/PCBSegClassNet/src\n", - "!python train_segmentation.py -opt cfs/pscn_seg.yml -epoch 5" - ] + "source": "# Full training run\n%cd /content/PCBSegClassNet/src\n!python train_segmentation.py -opt cfs/pscn_seg.yml -epoch 40" }, { "cell_type": "code", @@ -191,15 +171,11 @@ ] }, { - "cell_type": "code", + "cell_type": "markdown", "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Backup the best seg checkpoint to Drive\n", - "!cp /content/PCBSegClassNet/checkpoints/best_seg.h5 {DRIVE_CKPT_DIR}/best_seg.h5\n", - "!ls -la {DRIVE_CKPT_DIR}" - ] + "source": "## 9. 
Train classification" }, { "cell_type": "markdown", @@ -213,10 +189,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "%cd /content/PCBSegClassNet/src\n", - "!python train_classification.py -opt cfs/pscn_class.yml -epoch 5" - ] + "source": "%cd /content/PCBSegClassNet/src\n!python train_classification.py -opt cfs/pscn_class.yml -epoch 40" }, { "cell_type": "code", @@ -229,15 +202,11 @@ ] }, { - "cell_type": "code", + "cell_type": "markdown", "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Backup the best classification checkpoint to Drive\n", - "!cp /content/PCBSegClassNet/checkpoints/best_class.h5 {DRIVE_CKPT_DIR}/best_class.h5\n", - "!ls -la {DRIVE_CKPT_DIR}" - ] + "source": "## 10. (Optional) Evaluate without retraining\n\nPass `-epoch 0` to skip training; the script will load `best_*.h5` from `checkpoints/` and run `model.evaluate(val_dataset)`. Make sure the checkpoint is in `/content/PCBSegClassNet/checkpoints/` (copy it back from Drive if you reconnected)." }, { "cell_type": "markdown", From 912eb40df49ea960632ff68760b4df33506b5c3e Mon Sep 17 00:00:00 2001 From: ironmanizawesome Date: Sun, 10 May 2026 17:46:41 +0900 Subject: [PATCH 05/14] fix(colab): pin condacolab to Python 3.10 explicitly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The latest condacolab defaults to Python 3.11, which TF 2.10 also has no wheels for (only 3.7–3.10). Pass python_version="3.10" so the kernel restart lands on a Python 3.10 base that the TF 2.10 install can match. Co-Authored-By: Claude Opus 4.7 --- notebooks/colab_train.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/colab_train.ipynb b/notebooks/colab_train.ipynb index 1d840e6..2a5e4f8 100644 --- a/notebooks/colab_train.ipynb +++ b/notebooks/colab_train.ipynb @@ -56,14 +56,14 @@ { "cell_type": "markdown", "metadata": {}, - "source": "## 3. 
Set up Python 3.10 + TF 2.10 stack\n\nColab's default Python is now 3.12, but TF 2.10 only ships wheels for Python 3.7–3.10. Use `condacolab` to swap the runtime to a Python 3.10 base, then pin the TF 2.10 stack on top.\n\n**Why TF 2.10?** This codebase calls `tf.keras.backend.dot` / `backend.transpose` and a couple of other APIs that broke in Keras 3 (TF 2.16+). TF 2.10 is verified to run end-to-end.\n\n> ⚠️ The next cell **restarts the kernel** automatically. Wait for it to reconnect, then continue with the cells after it. The cloned repo at `/content/PCBSegClassNet` survives the restart (it lives on Colab's local disk)." + "source": "## 3. Set up Python 3.10 + TF 2.10 stack\n\nColab's default Python is now 3.12, but TF 2.10 only ships wheels for Python 3.7–3.10. Use `condacolab` to swap the runtime to a Python 3.10 base (the latest condacolab defaults to 3.11, which TF 2.10 also doesn't support — pin it explicitly), then install the TF 2.10 stack on top.\n\n**Why TF 2.10?** This codebase calls `tf.keras.backend.dot` / `backend.transpose` and a couple of other APIs that broke in Keras 3 (TF 2.16+). TF 2.10 is verified to run end-to-end.\n\n> ⚠️ The next cell **restarts the kernel** automatically. Wait for it to reconnect, then continue with the cells after it. The cloned repo at `/content/PCBSegClassNet` survives the restart (it lives on Colab's local disk)." 
}, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": "!pip install -q condacolab\nimport condacolab\ncondacolab.install() # kernel restarts automatically — rerun cells below afterwards" + "source": "!pip install -q condacolab\nimport condacolab\ncondacolab.install(python_version=\"3.10\") # kernel restarts; rerun cells below afterwards" }, { "cell_type": "code", From e7448ae9ec5c7b7cf5acbc644218e42b8bcc4b97 Mon Sep 17 00:00:00 2001 From: ironmanizawesome Date: Sun, 10 May 2026 18:02:20 +0900 Subject: [PATCH 06/14] docs(colab): switch to TF 2.15 + clean notebook duplicates Drop the condacolab Python 3.10 dance. Colab's default Python keeps moving past TF 2.10's wheel matrix (now 3.11/3.12), and the latest condacolab doesn't accept python_version on install_miniforge. TF 2.15 is the last TF release on Keras 2 (Keras 3 starts at TF 2.16) and ships wheels for the Python versions Colab actually serves, so the codebase's tf.keras.backend.{dot,transpose} usage keeps working with no source changes. Also rewrites the notebook from scratch to clean up duplicate cells that crept in during incremental NotebookEdit changes (two ## 6 / ## 7 sections, both 100- and 40-epoch training cells, missing sanity cells). Co-Authored-By: Claude Opus 4.7 --- notebooks/README.md | 22 ++-- notebooks/colab_train.ipynb | 200 +++++++++++++++++++++++++++--------- 2 files changed, 168 insertions(+), 54 deletions(-) diff --git a/notebooks/README.md b/notebooks/README.md index aa5b648..3fb3559 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -14,7 +14,7 @@ ``` https://colab.research.google.com/github//PCBSegClassNet/blob/colab/notebooks/colab_train.ipynb ``` -4. **Runtime → Change runtime type → GPU**, then run cells top to bottom. +4. **Runtime → Change runtime type → GPU** (T4 is enough; High-RAM not needed), then run cells top to bottom. 
## What the notebook does @@ -22,12 +22,12 @@ |---|---| | 1 | `nvidia-smi` GPU sanity | | 2 | Clone this repo (`colab` branch) | -| 3 | Swap kernel to Python 3.10 base via `condacolab`, then pin TF 2.10.1 + matching keras / protobuf / numpy. Colab's default Python 3.12 has no TF 2.10 wheels, and this codebase isn't compatible with Keras 3 (TF 2.16+) | +| 3 | Install TF 2.15 + dependencies (TF 2.15 is the last release on Keras 2; Keras 3 from TF 2.16+ breaks this codebase's `tf.keras.backend.{dot,transpose}` calls) | | 4 | Mount Drive, unzip `data_raw.zip` to local Colab disk | | 5 | `create_mask.py` — polygon masks + classification crops (EDSR super-resolution, GPU) | | 6 | `create_patches.py` — 768 px patches + 80/20 train/val split (CPU) | | 7 | Set up Drive checkpoint directory for persistence across sessions | -| 8 | Segmentation training (5 epochs sanity → 100 epochs full → mirror checkpoint to Drive) | +| 8 | Segmentation training (5 epochs sanity → 40 epochs full → mirror checkpoint to Drive) | | 9 | Classification training (same pattern) | | 10 | Optional: re-evaluate from Drive checkpoints in a fresh session | @@ -39,11 +39,11 @@ If you already have a processed dataset zip, you can skip cells 5–6 and unzip it directly into `data/` instead. -## Why TF 2.10 specifically? +## Why TF 2.15? -- This repo uses `tf.keras.activations.softmax(tensor)` and `tf.keras.backend.{dot,transpose}` patterns that broke in Keras 3. -- TF 2.10 was the last release with native Windows GPU; verified to work end-to-end. -- Colab's bundled TF (2.15+ with Keras 3) can produce `AttributeError`s on import without changes to the codebase. +- This repo uses `tf.keras.backend.dot` / `backend.transpose` and `tf.keras.activations.softmax(tensor)` patterns that broke in Keras 3. +- TF 2.15 is the **last TF release on Keras 2**; Keras 3 starts at TF 2.16. 
+- Earlier this notebook tried to pin TF 2.10 via `condacolab`, but Colab's base Python keeps moving past 3.10 and TF 2.10's wheel matrix doesn't follow. TF 2.15 ships wheels for the Python versions Colab actually serves. ## VRAM notes @@ -56,6 +56,14 @@ If you already have a processed dataset zip, you can skip cells 5–6 and unzip The default `batch_size: 16` in `cfs/pscn_seg.yml` works on all Colab GPUs. +## Epoch budget + +The notebook runs: +- **Sanity 5 epochs** before each full run, so you catch NaN losses or OOMs in <1 hour. +- **Full 40 epochs** for both segmentation and classification. + +40 + 40 ≈ 12 hours on a T4, which fits inside Colab Pro's 24 h session limit. If validation metrics are still improving at epoch 40, restore the best checkpoint and run more epochs (incremental training is supported by `-epoch`). + ## Session persistence Colab wipes `/content` on disconnect but Drive persists. The notebook copies the best checkpoint to Drive after each training run; section 10 shows how to restore it in a new session for evaluation. diff --git a/notebooks/colab_train.ipynb b/notebooks/colab_train.ipynb index 2a5e4f8..16f4b6d 100644 --- a/notebooks/colab_train.ipynb +++ b/notebooks/colab_train.ipynb @@ -6,14 +6,14 @@ "source": [ "# PCBSegClassNet — Colab Training\n", "\n", - "Train PCBSegNet (segmentation) and PCBClassNet (classification) on Google Colab GPU.\n", + "End-to-end pipeline on Google Colab GPU: data preprocessing (mask generation + patches + train/val split) → segmentation training → classification training.\n", "\n", - "**Why Colab?** Local 8 GB GPU (e.g. RTX 4060 Ti) is too tight for `batch=16` at 512×512 input — decoder activation alone is ~4 GB. Colab T4 (16 GB) or A100 (40 GB) handles it comfortably.\n", + "**Why Colab?** Local 8 GB GPUs (e.g. RTX 4060 Ti) are too tight for `batch=16` at 512×512 input — the segmentation decoder activation alone is ~4 GB. 
Colab T4 (16 GB) and above handle it comfortably.\n", "\n", "## Before you run\n", - "1. **Runtime → Change runtime type → GPU** (T4 / A100 / L4 — whatever you have).\n", - "2. Have your dataset ready as a zip in Google Drive (see *Data layout* below).\n", - "3. Mount Drive when prompted in the relevant cell." + "1. **Runtime → Change runtime type → GPU** (T4 is enough; High-RAM not needed).\n", + "2. Have `data_raw.zip` ready in Drive at `MyDrive/PCBSegClassNet/data_raw.zip` (~7 GB; contains `pcb_image/` + `smd_annotation/`).\n", + "3. Mount Drive when prompted in §4." ] }, { @@ -38,7 +38,7 @@ "source": [ "## 2. Clone the repo\n", "\n", - "If you forked it, change the URL to your fork." + "If you forked the repo, change the URL to your fork." ] }, { @@ -56,33 +56,58 @@ { "cell_type": "markdown", "metadata": {}, - "source": "## 3. Set up Python 3.10 + TF 2.10 stack\n\nColab's default Python is now 3.12, but TF 2.10 only ships wheels for Python 3.7–3.10. Use `condacolab` to swap the runtime to a Python 3.10 base (the latest condacolab defaults to 3.11, which TF 2.10 also doesn't support — pin it explicitly), then install the TF 2.10 stack on top.\n\n**Why TF 2.10?** This codebase calls `tf.keras.backend.dot` / `backend.transpose` and a couple of other APIs that broke in Keras 3 (TF 2.16+). TF 2.10 is verified to run end-to-end.\n\n> ⚠️ The next cell **restarts the kernel** automatically. Wait for it to reconnect, then continue with the cells after it. The cloned repo at `/content/PCBSegClassNet` survives the restart (it lives on Colab's local disk)." + "source": [ + "## 3. Install TF 2.15 + dependencies\n", + "\n", + "Use TF 2.15 — the last release on Keras 2 (Keras 3 starts at TF 2.16, which breaks `tf.keras.backend.{dot,transpose}` and other patterns this codebase relies on). TF 2.15 has wheels for Colab's default Python so no conda gymnastics needed." 
+ ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": "!pip install -q condacolab\nimport condacolab\ncondacolab.install(python_version=\"3.10\") # kernel restarts; rerun cells below afterwards" + "source": [ + "!pip install -q tensorflow==2.15.0 albumentations==1.4.18 opencv-python-headless pyyaml tqdm pandas scikit-learn" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": "!pip install -q \\\n tensorflow==2.10.1 \\\n keras==2.10.0 \\\n tensorflow-estimator==2.10.0 \\\n protobuf==3.19.6 \\\n numpy==1.24.4\n\n!pip install -q albumentations==1.4.18 opencv-python-headless pyyaml tqdm pandas scikit-learn" - }, - { - "cell_type": "code", - "source": "import sys, tensorflow as tf\nprint(\"Python:\", sys.version.split()[0])\nprint(\"TF:\", tf.__version__)\nprint(\"GPU:\", tf.config.list_physical_devices(\"GPU\"))", - "metadata": {}, - "execution_count": null, - "outputs": [] + "source": [ + "import sys, tensorflow as tf\n", + "print(\"Python:\", sys.version.split()[0])\n", + "print(\"TF:\", tf.__version__)\n", + "print(\"GPU:\", tf.config.list_physical_devices(\"GPU\"))" + ] }, { "cell_type": "markdown", "metadata": {}, - "source": "## 4. Mount Drive and unpack the raw FPIC archive\n\nThis notebook does the **entire data prep pipeline** (mask generation + patches + train/val split) in Colab so you only need to upload the raw FPIC images + annotations (~7 GB) instead of the processed dataset (~18 GB).\n\n### Data layout expected on Drive\nZip the **raw** FPIC images + annotations together and store on Drive:\n\n```\n/MyDrive/PCBSegClassNet/\n data_raw.zip ← contains: pcb_image/*.png + smd_annotation/*.csv\n checkpoints/ ← (optional, for resume / saved best models)\n```\n\nTo make the zip on a Windows host:\n\n```powershell\nCompress-Archive -Path data\\pcb_image, data\\smd_annotation -DestinationPath data_raw.zip -Force\n```\n\nWhy unzip to local disk and not stream from Drive? 
Drive mounts thousands of small files extremely slowly (API throttling). Always unpack to `/content` for training." + "source": [ + "## 4. Mount Drive and unpack the raw FPIC archive\n", + "\n", + "This notebook does the **entire data prep pipeline** (mask generation + patches + train/val split) in Colab so you only need to upload the raw FPIC images + annotations (~7 GB) instead of the processed dataset (~18 GB).\n", + "\n", + "### Data layout expected on Drive\n", + "Zip the **raw** FPIC images + annotations together:\n", + "\n", + "```\n", + "/MyDrive/PCBSegClassNet/\n", + " data_raw.zip ← contains: pcb_image/*.png + smd_annotation/*.csv\n", + " checkpoints/ ← (optional, for resume / saved best models)\n", + "```\n", + "\n", + "To make the zip on a Windows host:\n", + "\n", + "```powershell\n", + "Compress-Archive -Path data\\pcb_image, data\\smd_annotation -DestinationPath data_raw.zip -Force\n", + "```\n", + "\n", + "Why unzip to local disk and not stream from Drive? Drive mounts thousands of small files extremely slowly (API throttling). Always unpack to `/content` for training." 
+ ] }, { "cell_type": "code", @@ -99,57 +124,121 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Adjust path if you stored the raw zip elsewhere\nRAW_ZIP = \"/content/drive/MyDrive/PCBSegClassNet/data_raw.zip\"\n\nimport os, time\nassert os.path.exists(RAW_ZIP), f\"Not found: {RAW_ZIP}\"\n\n%cd /content/PCBSegClassNet\n!mkdir -p data\nt0 = time.time()\n!unzip -q -o {RAW_ZIP} -d data/\nprint(f\"Unzip done in {time.time()-t0:.1f}s\")\n\n!echo \"--- pcb_image:\"; ls data/pcb_image/ | wc -l\n!echo \"--- smd_annotation:\"; ls data/smd_annotation/ | wc -l" + "source": [ + "RAW_ZIP = \"/content/drive/MyDrive/PCBSegClassNet/data_raw.zip\"\n", + "\n", + "import os, time\n", + "assert os.path.exists(RAW_ZIP), f\"Not found: {RAW_ZIP}\"\n", + "\n", + "%cd /content/PCBSegClassNet\n", + "!mkdir -p data\n", + "t0 = time.time()\n", + "!unzip -q -o {RAW_ZIP} -d data/\n", + "print(f\"Unzip done in {time.time()-t0:.1f}s\")\n", + "\n", + "!echo \"--- pcb_image:\"; ls data/pcb_image/ | wc -l\n", + "!echo \"--- smd_annotation:\"; ls data/smd_annotation/ | wc -l" + ] }, { "cell_type": "markdown", - "source": "## 5. Generate masks + classification crops (`create_mask.py`)\n\nRuns through all annotation CSVs, fills polygon masks per component class, and writes:\n- `data/segmentation/images/` — HSI + CLAHE preprocessed PCB images\n- `data/segmentation/masks/` — RGB masks (color-encoded per class)\n- `data/classification/images//` — individual component crops upscaled with the EDSR super-resolution model in `checkpoints/super_resolution.h5`\n\nGPU-accelerated via the EDSR forward pass. Expect ~10–30 minutes depending on Colab GPU (A100 fastest).", - "metadata": {} + "metadata": {}, + "source": [ + "## 5. 
Generate masks + classification crops (`create_mask.py`)\n", + "\n", + "Runs through all annotation CSVs, fills polygon masks per component class, and writes:\n", + "- `data/segmentation/images/` — HSI + CLAHE preprocessed PCB images\n", + "- `data/segmentation/masks/` — RGB masks (color-encoded per class)\n", + "- `data/classification/images/<class>/` — individual component crops upscaled with the EDSR super-resolution model in `checkpoints/super_resolution.h5`\n", + "\n", + "GPU-accelerated via the EDSR forward pass. Expect ~10–30 minutes depending on Colab GPU." + ] }, { "cell_type": "code", - "source": "%cd /content/PCBSegClassNet/src/data\n!python create_mask.py \\\n -i ../../data/pcb_image/ \\\n -a ../../data/smd_annotation/ \\\n -id ../../data/segmentation/images \\\n -ad ../../data/segmentation/masks \\\n -cd ../../data/classification/images/\n\n!echo \"--- segmentation/images: $(ls ../../data/segmentation/images 2>/dev/null | wc -l)\"\n!echo \"--- segmentation/masks: $(ls ../../data/segmentation/masks 2>/dev/null | wc -l)\"\n!echo \"--- classification crops total: $(find ../../data/classification/images -type f | wc -l)\"\n!echo \"--- classification classes:\"; ls ../../data/classification/images 2>/dev/null", - "metadata": {}, "execution_count": null, - "outputs": [] + "metadata": {}, + "outputs": [], + "source": [ + "%cd /content/PCBSegClassNet/src/data\n", + "!python create_mask.py \\\n", + " -i ../../data/pcb_image/ \\\n", + " -a ../../data/smd_annotation/ \\\n", + " -id ../../data/segmentation/images \\\n", + " -ad ../../data/segmentation/masks \\\n", + " -cd ../../data/classification/images/\n", + "\n", + "!echo \"--- segmentation/images: $(ls ../../data/segmentation/images 2>/dev/null | wc -l)\"\n", + "!echo \"--- segmentation/masks: $(ls ../../data/segmentation/masks 2>/dev/null | wc -l)\"\n", + "!echo \"--- classification crops total: $(find ../../data/classification/images -type f | wc -l)\"\n", + "!echo \"--- 
classification classes:\"; ls 
../../data/classification/images 2>/dev/null" + ] }, { "cell_type": "markdown", - "source": "## 6. Cut 768 px patches and split into train/val (`create_patches.py`)\n\nCuts the full PCB images + masks into 768×768 patches and moves the patches + classification crops into `train/` and `val/` subfolders (80/20 split). Pure CPU work, ~5 minutes.\n\nAfter this cell, the dataset layout matches what the training scripts expect:\n\n```\ndata/segmentation/train/{images,masks}/*.png\ndata/segmentation/val/{images,masks}/*.png\ndata/classification/train/<class>/*.png\ndata/classification/val/<class>/*.png\n```", - "metadata": {} + "metadata": {}, + "source": [ + "## 6. Cut 768 px patches and split into train/val (`create_patches.py`)\n", + "\n", + "Cuts the full PCB images + masks into 768×768 patches and moves the patches + classification crops into `train/` and `val/` subfolders (80/20 split). Pure CPU work, ~5 minutes.\n", + "\n", + "After this cell, the dataset layout matches what the training scripts expect:\n", + "\n", + "```\n", + "data/segmentation/train/{images,masks}/*.png\n", + "data/segmentation/val/{images,masks}/*.png\n", + "data/classification/train/<class>/*.png\n", + "data/classification/val/<class>/*.png\n", + "```" + ] }, { - "cell_type": "markdown", - "source": "## 7. (Optional) Mirror checkpoints to Drive for persistence\n\nColab local disk is wiped on session end. Save best model files back to Drive at the end of training (or set up a callback). 
For now, just record the path.", - "metadata": {}, + "cell_type": "code", "execution_count": null, - "outputs": [] + "metadata": {}, + "outputs": [], + "source": [ + "%cd /content/PCBSegClassNet/src/data\n", + "!python create_patches.py \\\n", + " -i ../../data/segmentation/images/ \\\n", + " -m ../../data/segmentation/masks \\\n", + " -cd ../../data/classification/images/ \\\n", + " -ps 768\n", + "\n", + "!echo \"--- seg train: $(ls ../../data/segmentation/train/images 2>/dev/null | wc -l) images / $(ls ../../data/segmentation/train/masks 2>/dev/null | wc -l) masks\"\n", + "!echo \"--- seg val: $(ls ../../data/segmentation/val/images 2>/dev/null | wc -l) images / $(ls ../../data/segmentation/val/masks 2>/dev/null | wc -l) masks\"\n", + "!echo \"--- class train: $(find ../../data/classification/train -type f 2>/dev/null | wc -l) crops\"\n", + "!echo \"--- class val: $(find ../../data/classification/val -type f 2>/dev/null | wc -l) crops\"" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 5. (Optional) Mirror checkpoints to Drive for persistence\n", + "## 7. (Optional) Mirror checkpoints to Drive for persistence\n", "\n", - "Colab local disk is wiped on session end. Save best model files back to Drive at the end of training (or set up a callback). For now, just record the path." + "Colab local disk is wiped on session end. Save best model files back to Drive at the end of training. For now, just record the path." ] }, { - "cell_type": "markdown", + "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": "## 8. Train segmentation\n\nDefault config in `cfs/pscn_seg.yml` is `batch_size=16`, `epochs` controlled by `-epoch`.\n\n**First run a 5-epoch sanity pass.** If loss is finite and val_dice_coef is improving, kick off the full 100 epochs." 
+ "source": [ + "DRIVE_CKPT_DIR = \"/content/drive/MyDrive/PCBSegClassNet/checkpoints\"\n", + "!mkdir -p {DRIVE_CKPT_DIR}" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 6. Train segmentation\n", + "## 8. Train segmentation\n", "\n", "Default config in `cfs/pscn_seg.yml` is `batch_size=16`, `epochs` controlled by `-epoch`.\n", "\n", - "**First run a 5-epoch sanity pass.** If loss is finite and val_dice_coef is improving, kick off the full 100 epochs." + "**First run a 5-epoch sanity pass.** If loss is finite and val_dice_coef is improving, kick off the full 40 epochs." ] }, { @@ -157,7 +246,11 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Full training run\n%cd /content/PCBSegClassNet/src\n!python train_segmentation.py -opt cfs/pscn_seg.yml -epoch 40" + "source": [ + "# Sanity check: 5 epochs\n", + "%cd /content/PCBSegClassNet/src\n", + "!python train_segmentation.py -opt cfs/pscn_seg.yml -epoch 5" + ] }, { "cell_type": "code", @@ -165,23 +258,27 @@ "metadata": {}, "outputs": [], "source": [ - "# Full training run\n", + "# Full training run (40 epochs)\n", "%cd /content/PCBSegClassNet/src\n", - "!python train_segmentation.py -opt cfs/pscn_seg.yml -epoch 100" + "!python train_segmentation.py -opt cfs/pscn_seg.yml -epoch 40" ] }, { - "cell_type": "markdown", + "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": "## 9. Train classification" + "source": [ + "# Backup the best seg checkpoint to Drive\n", + "!cp /content/PCBSegClassNet/checkpoints/best_seg.h5 {DRIVE_CKPT_DIR}/best_seg.h5\n", + "!ls -la {DRIVE_CKPT_DIR}" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 7. Train classification" + "## 9. 
Train classification" ] }, { @@ -189,7 +286,11 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "%cd /content/PCBSegClassNet/src\n!python train_classification.py -opt cfs/pscn_class.yml -epoch 40" + "source": [ + "# Sanity check: 5 epochs\n", + "%cd /content/PCBSegClassNet/src\n", + "!python train_classification.py -opt cfs/pscn_class.yml -epoch 5" + ] }, { "cell_type": "code", @@ -197,22 +298,27 @@ "metadata": {}, "outputs": [], "source": [ + "# Full training run (40 epochs)\n", "%cd /content/PCBSegClassNet/src\n", - "!python train_classification.py -opt cfs/pscn_class.yml -epoch 100" + "!python train_classification.py -opt cfs/pscn_class.yml -epoch 40" ] }, { - "cell_type": "markdown", + "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": "## 10. (Optional) Evaluate without retraining\n\nPass `-epoch 0` to skip training; the script will load `best_*.h5` from `checkpoints/` and run `model.evaluate(val_dataset)`. Make sure the checkpoint is in `/content/PCBSegClassNet/checkpoints/` (copy it back from Drive if you reconnected)." + "source": [ + "# Backup the best classification checkpoint to Drive\n", + "!cp /content/PCBSegClassNet/checkpoints/best_class.h5 {DRIVE_CKPT_DIR}/best_class.h5\n", + "!ls -la {DRIVE_CKPT_DIR}" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 8. (Optional) Evaluate without retraining\n", + "## 10. (Optional) Evaluate without retraining\n", "\n", "Pass `-epoch 0` to skip training; the script will load `best_*.h5` from `checkpoints/` and run `model.evaluate(val_dataset)`. Make sure the checkpoint is in `/content/PCBSegClassNet/checkpoints/` (copy it back from Drive if you reconnected)." 
] @@ -250,4 +356,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} From 012893e89ee096a26261756980cca861e16dc833 Mon Sep 17 00:00:00 2001 From: ironmanizawesome Date: Sun, 10 May 2026 18:16:30 +0900 Subject: [PATCH 07/14] fix(colab): use tensorflow[and-cuda] extra so TF 2.15 sees the GPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plain `pip install tensorflow==2.15.0` on Colab falls back to CPU because Colab's bundled CUDA libs are pinned to whatever TF version Colab ships, not 2.15. The `[and-cuda]` extra pulls in matching nvidia-cudnn-cu12 / cublas-cu12 / etc. wheels alongside TF, which is what TF's GPU loader actually expects to dlopen. Without this, training falls back to CPU and create_mask.py / train_*.py take ~10× longer with periodic "Cannot dlopen some GPU libraries" warnings in stderr. Co-Authored-By: Claude Opus 4.7 --- notebooks/colab_train.ipynb | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/notebooks/colab_train.ipynb b/notebooks/colab_train.ipynb index 16f4b6d..73c2d7c 100644 --- a/notebooks/colab_train.ipynb +++ b/notebooks/colab_train.ipynb @@ -56,20 +56,14 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 3. Install TF 2.15 + dependencies\n", - "\n", - "Use TF 2.15 — the last release on Keras 2 (Keras 3 starts at TF 2.16, which breaks `tf.keras.backend.{dot,transpose}` and other patterns this codebase relies on). TF 2.15 has wheels for Colab's default Python so no conda gymnastics needed." - ] + "source": "## 3. Install TF 2.15 + dependencies\n\nUse TF 2.15 — the last release on Keras 2 (Keras 3 starts at TF 2.16, which breaks `tf.keras.backend.{dot,transpose}` and other patterns this codebase relies on). TF 2.15 has wheels for Colab's default Python so no conda gymnastics needed.\n\n> 🔑 The `tensorflow[and-cuda]==2.15.0` extra pulls in matching `nvidia-cudnn-cu12` / `nvidia-cublas-cu12` / etc. wheels. 
Plain `tensorflow==2.15.0` falls back to CPU on Colab because the bundled CUDA stack is for a different TF version.\n\nAfter this cell finishes, **Runtime → Restart session**, then run the verification cell below to confirm the GPU is visible." }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "!pip install -q tensorflow==2.15.0 albumentations==1.4.18 opencv-python-headless pyyaml tqdm pandas scikit-learn" - ] + "source": "!pip install -q \"tensorflow[and-cuda]==2.15.0\" albumentations==1.4.18 opencv-python-headless pyyaml tqdm pandas scikit-learn" }, { "cell_type": "code", @@ -356,4 +350,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} +} \ No newline at end of file From 67a7d97f3fc09c8a797b41e64b05ccdb2e497ecc Mon Sep 17 00:00:00 2001 From: ironmanizawesome Date: Sun, 10 May 2026 18:34:41 +0900 Subject: [PATCH 08/14] fix(colab): install nvidia CUDA wheels directly (and-cuda extra is broken) tensorflow[and-cuda]==2.15.0 fails to resolve because the extra pins tensorrt-libs==8.6.1, which has been removed from PyPI (only 9.x is still available). Drop the bracket extra and install nvidia-cudnn-cu12, nvidia-cublas-cu12, etc. by name in a separate pip call. TF needs them at dlopen time but doesn't actually use TensorRT for training. Co-Authored-By: Claude Opus 4.7 --- notebooks/colab_train.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/colab_train.ipynb b/notebooks/colab_train.ipynb index 73c2d7c..99a28b9 100644 --- a/notebooks/colab_train.ipynb +++ b/notebooks/colab_train.ipynb @@ -56,14 +56,14 @@ { "cell_type": "markdown", "metadata": {}, - "source": "## 3. Install TF 2.15 + dependencies\n\nUse TF 2.15 — the last release on Keras 2 (Keras 3 starts at TF 2.16, which breaks `tf.keras.backend.{dot,transpose}` and other patterns this codebase relies on). 
TF 2.15 has wheels for Colab's default Python so no conda gymnastics needed.\n\n> 🔑 The `tensorflow[and-cuda]==2.15.0` extra pulls in matching `nvidia-cudnn-cu12` / `nvidia-cublas-cu12` / etc. wheels. Plain `tensorflow==2.15.0` falls back to CPU on Colab because the bundled CUDA stack is for a different TF version.\n\nAfter this cell finishes, **Runtime → Restart session**, then run the verification cell below to confirm the GPU is visible." + "source": "## 3. Install TF 2.15 + dependencies\n\nUse TF 2.15 — the last release on Keras 2 (Keras 3 starts at TF 2.16, which breaks `tf.keras.backend.{dot,transpose}` and other patterns this codebase relies on). TF 2.15 has wheels for Colab's default Python so no conda gymnastics needed.\n\nThe cell below installs TF and the non-CUDA python deps in one pip call, then installs the NVIDIA CUDA libraries TF dlopens at runtime in a second call.\n\n> 🔑 We can't use `tensorflow[and-cuda]==2.15.0` here — that extra pins `tensorrt-libs==8.6.1`, which is no longer available on PyPI (only 9.x remains). Installing the cudnn / cublas / cuda-runtime / etc. wheels directly is enough; TensorRT is only needed for `tf.experimental.tensorrt` inference, not training.\n\nAfter this cell finishes, **Runtime → Restart session**, then run the verification cell to confirm the GPU is visible." 
}, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": "!pip install -q \"tensorflow[and-cuda]==2.15.0\" albumentations==1.4.18 opencv-python-headless pyyaml tqdm pandas scikit-learn" + "source": "# TF + non-CUDA python deps\n!pip install -q tensorflow==2.15.0 albumentations==1.4.18 opencv-python-headless pyyaml tqdm pandas scikit-learn\n\n# CUDA libs (the `tensorflow[and-cuda]` extra is broken because it pins\n# tensorrt-libs==8.6.1 which is no longer on PyPI; install the libs TF\n# actually needs to dlopen at runtime instead — TensorRT is not used by\n# training, only by tf.experimental.tensorrt inference).\n!pip install -q nvidia-cudnn-cu12 nvidia-cublas-cu12 nvidia-cuda-cupti-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu12 nvidia-cufft-cu12 nvidia-curand-cu12 nvidia-cusolver-cu12 nvidia-cusparse-cu12 nvidia-nccl-cu12" }, { "cell_type": "code", From 28a6665dbde89544e304b1c5dd415fe7a3811bfd Mon Sep 17 00:00:00 2001 From: ironmanizawesome Date: Sun, 10 May 2026 18:57:51 +0900 Subject: [PATCH 09/14] fix(colab): pin install + run to system python3.11 binary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Colab's notebook kernel runs on Python 3.12, but TF 2.15 only ships wheels for Python 3.9–3.11. Colab images already include /usr/local/bin/python3.11; install the TF 2.15 stack into that interpreter and run create_mask.py / create_patches.py / train_*.py via !python3.11 instead of !python. The notebook kernel itself stays on Python 3.12 — we never import tensorflow from kernel cells, just shell-out to python3.11 for everything that touches TF. 
Co-Authored-By: Claude Opus 4.7 --- notebooks/colab_train.ipynb | 40 ++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/notebooks/colab_train.ipynb b/notebooks/colab_train.ipynb index 99a28b9..d9c4216 100644 --- a/notebooks/colab_train.ipynb +++ b/notebooks/colab_train.ipynb @@ -56,14 +56,28 @@ { "cell_type": "markdown", "metadata": {}, - "source": "## 3. Install TF 2.15 + dependencies\n\nUse TF 2.15 — the last release on Keras 2 (Keras 3 starts at TF 2.16, which breaks `tf.keras.backend.{dot,transpose}` and other patterns this codebase relies on). TF 2.15 has wheels for Colab's default Python so no conda gymnastics needed.\n\nThe cell below installs TF and the non-CUDA python deps in one pip call, then installs the NVIDIA CUDA libraries TF dlopens at runtime in a second call.\n\n> 🔑 We can't use `tensorflow[and-cuda]==2.15.0` here — that extra pins `tensorrt-libs==8.6.1`, which is no longer available on PyPI (only 9.x remains). Installing the cudnn / cublas / cuda-runtime / etc. wheels directly is enough; TensorRT is only needed for `tf.experimental.tensorrt` inference, not training.\n\nAfter this cell finishes, **Runtime → Restart session**, then run the verification cell to confirm the GPU is visible." + "source": [ + "## 3. Install TF 2.15 + dependencies into Python 3.11\n", + "\n", + "TF 2.15 is the last release on Keras 2 (Keras 3 starts at TF 2.16, which breaks `tf.keras.backend.{dot,transpose}` and other patterns this codebase relies on). But TF 2.15 wheels only target Python 3.9–3.11, while Colab's notebook kernel runs on Python 3.12.\n", + "\n", + "Workaround: Colab images already ship a system `python3.11` binary at `/usr/local/bin/python3.11`. Install TF 2.15 + deps **into that interpreter** and run all training scripts via `!python3.11 ...`. 
The notebook kernel itself stays on 3.12 — that's fine, we never `import tensorflow` from it.\n", + "\n", + "> 🔑 We can't use `tensorflow[and-cuda]==2.15.0` here — that extra pins `tensorrt-libs==8.6.1`, which is no longer available on PyPI (only 9.x remains). Installing the cudnn / cublas / cuda-runtime / etc. wheels directly is enough; TensorRT is only needed for `tf.experimental.tensorrt` inference, not training." + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": "# TF + non-CUDA python deps\n!pip install -q tensorflow==2.15.0 albumentations==1.4.18 opencv-python-headless pyyaml tqdm pandas scikit-learn\n\n# CUDA libs (the `tensorflow[and-cuda]` extra is broken because it pins\n# tensorrt-libs==8.6.1 which is no longer on PyPI; install the libs TF\n# actually needs to dlopen at runtime instead — TensorRT is not used by\n# training, only by tf.experimental.tensorrt inference).\n!pip install -q nvidia-cudnn-cu12 nvidia-cublas-cu12 nvidia-cuda-cupti-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu12 nvidia-cufft-cu12 nvidia-curand-cu12 nvidia-cusolver-cu12 nvidia-cusparse-cu12 nvidia-nccl-cu12" + "source": [ + "# Python 3.11 already exists on Colab; install our stack into it.\n", + "!python3.11 -m pip install -q tensorflow==2.15.0 albumentations==1.4.18 opencv-python-headless pyyaml tqdm pandas scikit-learn\n", + "\n", + "# CUDA libs TF needs to dlopen at runtime.\n", + "!python3.11 -m pip install -q nvidia-cudnn-cu12 nvidia-cublas-cu12 nvidia-cuda-cupti-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu12 nvidia-cufft-cu12 nvidia-curand-cu12 nvidia-cusolver-cu12 nvidia-cusparse-cu12 nvidia-nccl-cu12" + ] }, { "cell_type": "code", @@ -71,10 +85,8 @@ "metadata": {}, "outputs": [], "source": [ - "import sys, tensorflow as tf\n", - "print(\"Python:\", sys.version.split()[0])\n", - "print(\"TF:\", tf.__version__)\n", - "print(\"GPU:\", tf.config.list_physical_devices(\"GPU\"))" + "# Verify TF + GPU under Python 3.11 
(the interpreter that will actually run training)\n", + "!python3.11 -c \"import sys, tensorflow as tf; print('Python:', sys.version.split()[0]); print('TF:', tf.__version__); print('Keras:', tf.keras.__version__); print('GPU:', tf.config.list_physical_devices('GPU'))\"" ] }, { @@ -155,7 +167,7 @@ "outputs": [], "source": [ "%cd /content/PCBSegClassNet/src/data\n", - "!python create_mask.py \\\n", + "!python3.11 create_mask.py \\\n", " -i ../../data/pcb_image/ \\\n", " -a ../../data/smd_annotation/ \\\n", " -id ../../data/segmentation/images \\\n", @@ -193,7 +205,7 @@ "outputs": [], "source": [ "%cd /content/PCBSegClassNet/src/data\n", - "!python create_patches.py \\\n", + "!python3.11 create_patches.py \\\n", " -i ../../data/segmentation/images/ \\\n", " -m ../../data/segmentation/masks \\\n", " -cd ../../data/classification/images/ \\\n", @@ -243,7 +255,7 @@ "source": [ "# Sanity check: 5 epochs\n", "%cd /content/PCBSegClassNet/src\n", - "!python train_segmentation.py -opt cfs/pscn_seg.yml -epoch 5" + "!python3.11 train_segmentation.py -opt cfs/pscn_seg.yml -epoch 5" ] }, { @@ -254,7 +266,7 @@ "source": [ "# Full training run (40 epochs)\n", "%cd /content/PCBSegClassNet/src\n", - "!python train_segmentation.py -opt cfs/pscn_seg.yml -epoch 40" + "!python3.11 train_segmentation.py -opt cfs/pscn_seg.yml -epoch 40" ] }, { @@ -283,7 +295,7 @@ "source": [ "# Sanity check: 5 epochs\n", "%cd /content/PCBSegClassNet/src\n", - "!python train_classification.py -opt cfs/pscn_class.yml -epoch 5" + "!python3.11 train_classification.py -opt cfs/pscn_class.yml -epoch 5" ] }, { @@ -294,7 +306,7 @@ "source": [ "# Full training run (40 epochs)\n", "%cd /content/PCBSegClassNet/src\n", - "!python train_classification.py -opt cfs/pscn_class.yml -epoch 40" + "!python3.11 train_classification.py -opt cfs/pscn_class.yml -epoch 40" ] }, { @@ -329,7 +341,7 @@ "!cp {DRIVE_CKPT_DIR}/best_class.h5 /content/PCBSegClassNet/checkpoints/ 2>/dev/null || echo 'no class ckpt'\n", "\n", "%cd 
/content/PCBSegClassNet/src\n", - "!python train_segmentation.py -opt cfs/pscn_seg.yml -epoch 0" + "!python3.11 train_segmentation.py -opt cfs/pscn_seg.yml -epoch 0" ] } ], @@ -350,4 +362,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} From db984dd15c6a80600ac74a689071f6d59c69f1d2 Mon Sep 17 00:00:00 2001 From: ironmanizawesome Date: Sun, 10 May 2026 20:46:50 +0900 Subject: [PATCH 10/14] fix(colab): set TF_GPU_ALLOCATOR=cuda_malloc_async on every training cell DISLoss's SSIM gradient backward path spikes a 416 MB tensor ([batch=16, 26 classes, 512, 512]) that fragments allocator on T4 16 GB GPUs and OOMs even though plenty of free memory exists. TF itself recommends `cuda_malloc_async` in this case. Add it as a prefix to every train/eval invocation so the recommendation actually fires; on L4 24 GB it's redundant but harmless. Co-Authored-By: Claude Opus 4.7 --- notebooks/colab_train.ipynb | 36 ++++++------------------------------ 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/notebooks/colab_train.ipynb b/notebooks/colab_train.ipynb index d9c4216..47b5f06 100644 --- a/notebooks/colab_train.ipynb +++ b/notebooks/colab_train.ipynb @@ -252,22 +252,14 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Sanity check: 5 epochs\n", - "%cd /content/PCBSegClassNet/src\n", - "!python3.11 train_segmentation.py -opt cfs/pscn_seg.yml -epoch 5" - ] + "source": "# Sanity check: 5 epochs\n# TF_GPU_ALLOCATOR=cuda_malloc_async reduces fragmentation OOMs on 16 GB GPUs\n# (the SSIM gradient in DISLoss spikes a 416 MB tensor that can fail to fit\n# even on T4 16 GB without async allocator). 
On L4 24 GB it's belt-and-braces.\n%cd /content/PCBSegClassNet/src\n!TF_GPU_ALLOCATOR=cuda_malloc_async python3.11 train_segmentation.py -opt cfs/pscn_seg.yml -epoch 5" }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Full training run (40 epochs)\n", - "%cd /content/PCBSegClassNet/src\n", - "!python3.11 train_segmentation.py -opt cfs/pscn_seg.yml -epoch 40" - ] + "source": "# Full training run (40 epochs)\n%cd /content/PCBSegClassNet/src\n!TF_GPU_ALLOCATOR=cuda_malloc_async python3.11 train_segmentation.py -opt cfs/pscn_seg.yml -epoch 40" }, { "cell_type": "code", @@ -292,22 +284,14 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Sanity check: 5 epochs\n", - "%cd /content/PCBSegClassNet/src\n", - "!python3.11 train_classification.py -opt cfs/pscn_class.yml -epoch 5" - ] + "source": "# Sanity check: 5 epochs\n%cd /content/PCBSegClassNet/src\n!TF_GPU_ALLOCATOR=cuda_malloc_async python3.11 train_classification.py -opt cfs/pscn_class.yml -epoch 5" }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Full training run (40 epochs)\n", - "%cd /content/PCBSegClassNet/src\n", - "!python3.11 train_classification.py -opt cfs/pscn_class.yml -epoch 40" - ] + "source": "# Full training run (40 epochs)\n%cd /content/PCBSegClassNet/src\n!TF_GPU_ALLOCATOR=cuda_malloc_async python3.11 train_classification.py -opt cfs/pscn_class.yml -epoch 40" }, { "cell_type": "code", @@ -334,15 +318,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Restore checkpoints from Drive after a fresh session\n", - "!mkdir -p /content/PCBSegClassNet/checkpoints\n", - "!cp {DRIVE_CKPT_DIR}/best_seg.h5 /content/PCBSegClassNet/checkpoints/ 2>/dev/null || echo 'no seg ckpt'\n", - "!cp {DRIVE_CKPT_DIR}/best_class.h5 /content/PCBSegClassNet/checkpoints/ 2>/dev/null || echo 'no class ckpt'\n", - "\n", - "%cd /content/PCBSegClassNet/src\n", - "!python3.11 
train_segmentation.py -opt cfs/pscn_seg.yml -epoch 0" - ] + "source": "# Restore checkpoints from Drive after a fresh session\n!mkdir -p /content/PCBSegClassNet/checkpoints\n!cp {DRIVE_CKPT_DIR}/best_seg.h5 /content/PCBSegClassNet/checkpoints/ 2>/dev/null || echo 'no seg ckpt'\n!cp {DRIVE_CKPT_DIR}/best_class.h5 /content/PCBSegClassNet/checkpoints/ 2>/dev/null || echo 'no class ckpt'\n\n%cd /content/PCBSegClassNet/src\n!TF_GPU_ALLOCATOR=cuda_malloc_async python3.11 train_segmentation.py -opt cfs/pscn_seg.yml -epoch 0" } ], "metadata": { @@ -362,4 +338,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} +} \ No newline at end of file From a3dcde5d8944d491b914970a9c7584bc2836b022 Mon Sep 17 00:00:00 2001 From: ironmanizawesome Date: Sun, 10 May 2026 20:53:58 +0900 Subject: [PATCH 11/14] fix(colab): install python3.11 if missing + pin CUDA wheel versions L4 Colab images don't ship python3.11 (T4 ones do). Add a guard that installs python3.11 from deadsnakes PPA when it's missing. Pin every nvidia-*-cu12 wheel to the version TF 2.15 expects to dlopen: - nvidia-cudnn-cu12==8.9.4.25 (latest is 9.x; TF 2.15 needs libcudnn.so.8) - nvidia-cublas-cu12==12.2.5.6 etc. Without these pins TF 2.15 falls back to CPU on a fresh runtime because it can't find the right .so versions, and the warnings are easy to miss. 
Co-Authored-By: Claude Opus 4.7 --- notebooks/colab_train.ipynb | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/notebooks/colab_train.ipynb b/notebooks/colab_train.ipynb index 47b5f06..8a3a3a0 100644 --- a/notebooks/colab_train.ipynb +++ b/notebooks/colab_train.ipynb @@ -71,13 +71,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Python 3.11 already exists on Colab; install our stack into it.\n", - "!python3.11 -m pip install -q tensorflow==2.15.0 albumentations==1.4.18 opencv-python-headless pyyaml tqdm pandas scikit-learn\n", - "\n", - "# CUDA libs TF needs to dlopen at runtime.\n", - "!python3.11 -m pip install -q nvidia-cudnn-cu12 nvidia-cublas-cu12 nvidia-cuda-cupti-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu12 nvidia-cufft-cu12 nvidia-curand-cu12 nvidia-cusolver-cu12 nvidia-cusparse-cu12 nvidia-nccl-cu12" - ] + "source": "# If python3.11 doesn't exist on this Colab image (e.g. L4 base), install it first.\n# T4 base already has /usr/local/bin/python3.11 so this is a no-op there.\n!command -v python3.11 >/dev/null || { \\\n apt-get update -q && \\\n apt-get install -y software-properties-common && \\\n add-apt-repository -y ppa:deadsnakes/ppa && \\\n apt-get update -q && \\\n apt-get install -y python3.11 python3.11-dev python3.11-distutils && \\\n curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11; \\\n}\n\n# TF + non-CUDA python deps\n!python3.11 -m pip install -q tensorflow==2.15.0 albumentations==1.4.18 opencv-python-headless pyyaml tqdm pandas scikit-learn\n\n# CUDA libs pinned to versions matching TF 2.15. 
The latest nvidia-cudnn-cu12 is\n# 9.x which TF 2.15 cannot dlopen (it links against libcudnn.so.8 specifically),\n# so version pinning is mandatory here.\n!python3.11 -m pip install -q \\\n nvidia-cudnn-cu12==8.9.4.25 \\\n nvidia-cublas-cu12==12.2.5.6 \\\n nvidia-cuda-cupti-cu12==12.2.142 \\\n nvidia-cuda-nvrtc-cu12==12.2.140 \\\n nvidia-cuda-runtime-cu12==12.2.140 \\\n nvidia-cufft-cu12==11.0.8.103 \\\n nvidia-curand-cu12==10.3.3.141 \\\n nvidia-cusolver-cu12==11.5.2.141 \\\n nvidia-cusparse-cu12==12.1.2.141 \\\n nvidia-nccl-cu12==2.16.5" }, { "cell_type": "code", From d988186d0c3023f0d46c81f187b3b117063bb6b2 Mon Sep 17 00:00:00 2001 From: ironmanizawesome Date: Sun, 10 May 2026 22:40:56 +0900 Subject: [PATCH 12/14] config(colab): bump full training to 80 epochs each (was 40) 40 was too aggressive a cut from the paper's 100. 80 is the sweet spot: enough room for ReduceLROnPlateau (patience=15) to fire and fine-tune, while still fitting inside Colab Pro's 24 h session limit (~9h per model on L4 = 18h total + preprocessing buffer). 
Co-Authored-By: Claude Opus 4.7 --- notebooks/README.md | 6 +++--- notebooks/colab_train.ipynb | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/notebooks/README.md b/notebooks/README.md index 3fb3559..f1ee2f8 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -27,7 +27,7 @@ | 5 | `create_mask.py` — polygon masks + classification crops (EDSR super-resolution, GPU) | | 6 | `create_patches.py` — 768 px patches + 80/20 train/val split (CPU) | | 7 | Set up Drive checkpoint directory for persistence across sessions | -| 8 | Segmentation training (5 epochs sanity → 40 epochs full → mirror checkpoint to Drive) | +| 8 | Segmentation training (5 epochs sanity → 80 epochs full → mirror checkpoint to Drive) | | 9 | Classification training (same pattern) | | 10 | Optional: re-evaluate from Drive checkpoints in a fresh session | @@ -60,9 +60,9 @@ The default `batch_size: 16` in `cfs/pscn_seg.yml` works on all Colab GPUs. The notebook runs: - **Sanity 5 epochs** before each full run, so you catch NaN losses or OOMs in <1 hour. -- **Full 40 epochs** for both segmentation and classification. +- **Full 80 epochs** for both segmentation and classification. -40 + 40 ≈ 12 hours on a T4, which fits inside Colab Pro's 24 h session limit. If validation metrics are still improving at epoch 40, restore the best checkpoint and run more epochs (incremental training is supported by `-epoch`). +80 + 80 ≈ 18 hours on an L4, fitting inside Colab Pro's 24 h session limit with margin. The original paper trained for 100 epochs; 80 leaves a safety buffer for the inevitable Drive-mount / preprocessing time at the start of a session. If you want closer to paper-faithful runs, push to 100 once you've seen one full run complete. 
## Session persistence diff --git a/notebooks/colab_train.ipynb b/notebooks/colab_train.ipynb index 8a3a3a0..3c02f0a 100644 --- a/notebooks/colab_train.ipynb +++ b/notebooks/colab_train.ipynb @@ -253,7 +253,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Full training run (40 epochs)\n%cd /content/PCBSegClassNet/src\n!TF_GPU_ALLOCATOR=cuda_malloc_async python3.11 train_segmentation.py -opt cfs/pscn_seg.yml -epoch 40" + "source": "# Full training run (80 epochs)\n%cd /content/PCBSegClassNet/src\n!TF_GPU_ALLOCATOR=cuda_malloc_async python3.11 train_segmentation.py -opt cfs/pscn_seg.yml -epoch 80" }, { "cell_type": "code", @@ -285,7 +285,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Full training run (40 epochs)\n%cd /content/PCBSegClassNet/src\n!TF_GPU_ALLOCATOR=cuda_malloc_async python3.11 train_classification.py -opt cfs/pscn_class.yml -epoch 40" + "source": "# Full training run (80 epochs)\n%cd /content/PCBSegClassNet/src\n!TF_GPU_ALLOCATOR=cuda_malloc_async python3.11 train_classification.py -opt cfs/pscn_class.yml -epoch 80" }, { "cell_type": "code", From 563c182b4d45536a01586be34744e1bbe82916d2 Mon Sep 17 00:00:00 2001 From: ironmanizawesome Date: Mon, 11 May 2026 20:56:46 +0900 Subject: [PATCH 13/14] feat(seg): -resume flag + fine-tune config for second-stage training MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first 80-epoch segmentation run lands val_dice around 0.71 with train dice 0.92 (clear overfit) and lr already at min_lr=1e-5. Add a second optional stage that resumes from best_seg.h5 with a lower lr range so ReduceLROnPlateau can keep stepping down past 1e-5. Changes: - train_segmentation.py: -resume CLI flag; when set, model.load_weights is called on the configured checkpoint path before fit(). - src/cfs/pscn_seg_finetune.yml: same architecture as pscn_seg.yml but lr=1e-5 (where the first run left off) and min_lr=1e-6. 
- notebooks/colab_train.ipynb: new §8b that restores best_seg.h5 from Drive if missing, runs 20 epochs with -resume + the finetune config, then re-mirrors the best checkpoint. - .gitignore: ignore /best_*.h5 and root-level *.zip (Colab artifacts that landed in the working tree). Co-Authored-By: Claude Opus 4.7 --- .gitignore | 7 +++ notebooks/colab_train.ipynb | 19 ++++++++ src/cfs/pscn_seg_finetune.yml | 86 +++++++++++++++++++++++++++++++++++ src/train_segmentation.py | 10 ++++ 4 files changed, 122 insertions(+) create mode 100644 src/cfs/pscn_seg_finetune.yml diff --git a/.gitignore b/.gitignore index d4519f0..655922b 100755 --- a/.gitignore +++ b/.gitignore @@ -93,3 +93,10 @@ target/ # Training logs /logs/*.log + +# Model checkpoints downloaded from Colab (super_resolution.h5 in checkpoints/ +# is already tracked; this only catches root-level .h5 backups) +/best_*.h5 + +# Data archives at repo root +/*.zip diff --git a/notebooks/colab_train.ipynb b/notebooks/colab_train.ipynb index 3c02f0a..b593027 100644 --- a/notebooks/colab_train.ipynb +++ b/notebooks/colab_train.ipynb @@ -266,6 +266,25 @@ "!ls -la {DRIVE_CKPT_DIR}" ] }, + { + "cell_type": "markdown", + "source": "### 8b. (Optional) Fine-tune segmentation from best checkpoint\n\nThe first full run leaves `lr` at the `min_lr` (1e-5) of `ReduceLROnPlateau`. 
To squeeze more out of the model, resume from `best_seg.h5` with `pscn_seg_finetune.yml` — same architecture but `lr=1e-5` start and `min_lr=1e-6` so the plateau callback can step down further.\n\nIf you're running this in a fresh session, the fine-tune cell below (the second code cell) first restores `best_seg.h5` from Drive into Colab local disk (the resume flag loads weights from there). Run the backup cell only after fine-tuning has finished.", + "metadata": {} + }, + { + "cell_type": "code", + "source": "# Run this AFTER the fine-tune cell: backs up the fine-tuned best_seg.h5 to Drive.\n# NOTE: ModelCheckpoint's best-so-far tracker starts fresh in every run (assuming no\n# initial_value_threshold is configured), so this file holds the best fine-tune epoch,\n# which is not guaranteed to beat the first run. This overwrites the Drive copy.\n!cp /content/PCBSegClassNet/checkpoints/best_seg.h5 {DRIVE_CKPT_DIR}/best_seg.h5\n!ls -la {DRIVE_CKPT_DIR}", + "metadata": {}, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": "# Make sure best_seg.h5 exists locally (restore from Drive if fresh session)\n!mkdir -p /content/PCBSegClassNet/checkpoints\n!test -f /content/PCBSegClassNet/checkpoints/best_seg.h5 || cp {DRIVE_CKPT_DIR}/best_seg.h5 /content/PCBSegClassNet/checkpoints/\n!ls -la /content/PCBSegClassNet/checkpoints/best_seg.h5\n\n# Fine-tune: 20 epochs, lr=1e-5 → min_lr=1e-6, resume from best_seg.h5\n%cd /content/PCBSegClassNet/src\n!TF_GPU_ALLOCATOR=cuda_malloc_async python3.11 train_segmentation.py -opt cfs/pscn_seg_finetune.yml -epoch 20 -resume", + "metadata": {}, + "execution_count": null, + "outputs": [] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/src/cfs/pscn_seg_finetune.yml b/src/cfs/pscn_seg_finetune.yml new file mode 100644 index 0000000..7ca6ddc --- /dev/null +++ b/src/cfs/pscn_seg_finetune.yml @@ -0,0 +1,86 @@ +# ------------------------------------------------------------------------ +# Copyright (c) 2023 CandleLabAI. All Rights Reserved. +# ------------------------------------------------------------------------ +# Fine-tune config: same model, smaller learning rate. Use with -resume so +# weights are loaded from best_seg.h5 before training continues. 
+# general settings +name: PCBSegClassNet +model_type: SegmentationModel + +datasets: + train: + name: FPIC + type: Segmentation + data_images: ../data/segmentation/train/images/ + data_masks: ../data/segmentation/train/masks/ + + img_size_h: 512 + img_size_w: 512 + + # data loader + use_shuffle: true + batch_size: 16 + + val: + name: FPIC + type: Segmentation + data_images: ../data/segmentation/val/images/ + data_masks: ../data/segmentation/val/masks/ + + img_size_h: 512 + img_size_w: 512 + + use_shuffle: false + batch_size: 1 + +# path +path: + checkpoint_network: ../checkpoints/best_seg.h5 + log_file: ../logs/app.log + +# training settings +train: + optim: + type: Adam + lr: 0.00001 # 1e-5 — start where the previous run left off + betas: [0.9, 0.9] + + callbacks: + modelcheckpoint: + type: ModelCheckpoint + monitor: val_dice_coef + mode: max + verbose: 1 + save_best_only: true + save_weights_only: false + + reducelronplateau: + type: ReduceLROnPlateau + monitor: val_loss + mode: min + verbose: 1 + factor: 0.1 + patience: 15 + min_lr: 0.000001 # 1e-6 — allow finer adjustments than the initial run + + num_classes: 25 + + # losses + loss: + type: DISLoss + + # metrics + metric: + DICE: + type: dice_coef + IoU: + type: jacard_coef + +# val settings +val: + # metrics + metric: + DICE: + type: DiceCoef + IoU: + type: IoU diff --git a/src/train_segmentation.py b/src/train_segmentation.py index d51e2bb..fbda5d3 100755 --- a/src/train_segmentation.py +++ b/src/train_segmentation.py @@ -34,10 +34,14 @@ def parse_config(): type=int, default=1, help="number of epochs.") + parser.add_argument("-resume", + action="store_true", + help="resume training from existing best checkpoint.") args = parser.parse_args() opt = parse(args.opt) opt["train"]["total_epochs"] = args.epoch + opt["train"]["resume"] = args.resume return opt def init_log(opt): @@ -110,6 +114,12 @@ def main(): ) ) + # resume from existing checkpoint if requested + import os + if opt["train"].get("resume") and 
os.path.exists(opt["path"]["checkpoint_network"]): + logger.info(f"Resuming from {opt['path']['checkpoint_network']}") + model.load_weights(opt["path"]["checkpoint_network"]) + # training model if opt["train"]["total_epochs"] > 0: logger.info(f"Training for {opt['train']['total_epochs']} epochs") From 13107833063f34cb37428d8b18d3bd4bf4a4266f Mon Sep 17 00:00:00 2001 From: ironmanizawesome Date: Mon, 11 May 2026 21:12:15 +0900 Subject: [PATCH 14/14] fix(colab): drop tf.keras.__version__ from verify cell TF 2.15 lazy-loads tf.keras, and accessing __version__ on it raises AttributeError mid-cell, swallowing the GPU print that follows. Print TF version + GPU list only; users who specifically need the keras version can run it in a separate cell. Co-Authored-By: Claude Opus 4.7 --- notebooks/colab_train.ipynb | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/notebooks/colab_train.ipynb b/notebooks/colab_train.ipynb index b593027..962f7e3 100644 --- a/notebooks/colab_train.ipynb +++ b/notebooks/colab_train.ipynb @@ -78,10 +78,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Verify TF + GPU under Python 3.11 (the interpreter that will actually run training)\n", - "!python3.11 -c \"import sys, tensorflow as tf; print('Python:', sys.version.split()[0]); print('TF:', tf.__version__); print('Keras:', tf.keras.__version__); print('GPU:', tf.config.list_physical_devices('GPU'))\"" - ] + "source": "# Verify TF + GPU under Python 3.11 (the interpreter that will actually run training).\n# `tf.keras.__version__` raises AttributeError on TF 2.15 due to a lazy_loader quirk,\n# so we deliberately don't print it here. Skip-tested: works on both T4 and L4 images.\n!python3.11 -c \"import sys, tensorflow as tf; print('Python:', sys.version.split()[0]); print('TF:', tf.__version__); print('GPU:', tf.config.list_physical_devices('GPU'))\"" }, { "cell_type": "markdown",