Skip to content
49 changes: 49 additions & 0 deletions examples/qwen3_8b_opd_tillicum/00_pull_or_load_container.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env bash
#
# Fetch the container image named by SLIME_IMAGE_URI into SLIME_SIF and run a
# quick `python -V` inside it as a smoke test.
#
# All SLIME_* / APPTAINER_* settings are expected to come from env.sh, which
# lives next to this script. APPTAINER_BIN may be preset by the caller to pick
# a specific runtime binary; otherwise apptainer is preferred over singularity.
#
# An existing SLIME_SIF is never overwritten: without FORCE_PULL=1 the script
# exits 0, with FORCE_PULL=1 it exits 1 and asks for manual removal.
set -euo pipefail

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
source "${SCRIPT_DIR}/env.sh"

# Resolve the container runtime, honouring a caller-supplied APPTAINER_BIN.
APPTAINER_BIN="${APPTAINER_BIN:-}"
if [[ -z "${APPTAINER_BIN}" ]]; then
  for runtime in apptainer singularity; do
    if command -v "${runtime}" >/dev/null 2>&1; then
      APPTAINER_BIN="${runtime}"
      break
    fi
  done
  if [[ -z "${APPTAINER_BIN}" ]]; then
    echo "Neither apptainer nor singularity is available on PATH." >&2
    exit 1
  fi
fi

# The runtime reads its cache/tmp locations from the environment.
mkdir -p \
  "$(dirname "${SLIME_SIF}")" \
  "${APPTAINER_CACHEDIR}" \
  "${APPTAINER_TMPDIR}"
export APPTAINER_CACHEDIR APPTAINER_TMPDIR

printf '%s\n' \
  "Container setup" \
  "image uri: ${SLIME_IMAGE_URI}" \
  "format : ${SLIME_CONTAINER_FORMAT}" \
  "path : ${SLIME_SIF}" \
  "cache : ${APPTAINER_CACHEDIR}" \
  "tmp : ${APPTAINER_TMPDIR}"

# Never clobber an existing image; FORCE_PULL only changes the exit status
# and the message, the user still has to move the old image away.
if [[ -e "${SLIME_SIF}" ]]; then
  if [[ "${FORCE_PULL:-0}" == "1" ]]; then
    echo "Refusing to overwrite existing ${SLIME_SIF} automatically." >&2
    echo "Move it aside or remove it manually, then rerun this script." >&2
    exit 1
  fi
  echo "Container already exists. Set FORCE_PULL=1 and remove/replace it manually if needed."
  exit 0
fi

case "${SLIME_CONTAINER_FORMAT}" in
  sandbox)
    "${APPTAINER_BIN}" build --sandbox "${SLIME_SIF}" "${SLIME_IMAGE_URI}"
    ;;
  *)
    "${APPTAINER_BIN}" pull --force "${SLIME_SIF}" "${SLIME_IMAGE_URI}"
    ;;
esac

# Smoke test: the image must at least be able to start its Python.
"${APPTAINER_BIN}" exec --cleanenv "${SLIME_SIF}" python -V
echo "Wrote and validated ${SLIME_SIF}"
81 changes: 81 additions & 0 deletions examples/qwen3_8b_opd_tillicum/01_prepare_env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env bash
#
# Create every directory the Tillicum Qwen3-8B SFT + OPD example writes to and
# print the effective configuration. All variables are expected to come from
# env.sh, which lives next to this script. Nothing else is done here: no
# downloads, no container pulls, no job submission.
set -euo pipefail

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
source "${SCRIPT_DIR}/env.sh"

# Create all working directories in a single mkdir -p call.
create_directories() {
  local -a required_dirs=(
    "${SCRATCH_ROOT}"
    "${DATA_ROOT}"
    "${MODEL_ROOT}"
    "${OUTPUT_ROOT}"
    "${HF_HOME}"
    "${HF_DATASETS_CACHE}"
    "${TRANSFORMERS_CACHE}"
    "${WANDB_DIR}"
    "${TMPDIR}"
    "${RAY_TMPDIR}"
    "${APPTAINER_CACHEDIR}"
    "${APPTAINER_TMPDIR}"
    "$(dirname "${SLIME_SIF}")"
    "${CONTAINER_HOME}"
    "${SFT_SAVE_DIR}"
    "${OPD_SAVE_DIR}"
    "${SFT_HF_SNAPSHOT_DIR}"
    "${OPD_HF_SNAPSHOT_DIR}"
    "${SFT_DETAILS_DIR}"
    "${OPD_ROLLOUT_LOG_DIR}"
    "${TEACHER_LOG_DIR}"
    "${EVAL_OUTPUT_DIR}"
    "${BASE_EVAL_OUTPUT_DIR}"
    "${SFT_EVAL_OUTPUT_DIR}"
    "${OPD_EVAL_OUTPUT_DIR}"
    "${CHECKPOINT_REPORT_DIR}"
    "${SLURM_LOG_DIR}"
  )
  mkdir -p "${required_dirs[@]}"
}

# Print the resolved configuration to stdout.
print_summary() {
  cat <<EOF
Tillicum Qwen3-8B SFT + OPD environment

Slurm
ACCOUNT=${ACCOUNT}
PARTITION=${PARTITION}
QOS=${QOS}
resources per job: 1 node, 8 x H200, 64 CPUs, all node memory

Container
SLIME_IMAGE_URI=${SLIME_IMAGE_URI}
SLIME_CONTAINER_FORMAT=${SLIME_CONTAINER_FORMAT}
SLIME_SIF=${SLIME_SIF}
bind roots=${CONTAINER_BIND_ROOTS}

Data
OT3_DATASET=${OT3_DATASET}
filter=${DATA_MATH_FIELD} == ${DATA_MATH_VALUE}
seed=${DATA_SEED}
SFT_PARQUET=${SFT_PARQUET}
OPD_JSONL=${OPD_JSONL}
SPLIT_METADATA=${SPLIT_METADATA}
MATH500_JSONL=${MATH500_JSONL}

Models
student=${STUDENT_HF_REPO}
teacher=${TEACHER_HF_REPO}
STUDENT_HF_DIR=${STUDENT_HF_DIR}
TEACHER_HF_DIR=${TEACHER_HF_DIR}
STUDENT_TORCH_DIST_DIR=${STUDENT_TORCH_DIST_DIR}

Outputs
SFT_SAVE_DIR=${SFT_SAVE_DIR}
OPD_SAVE_DIR=${OPD_SAVE_DIR}
SFT_HF_SNAPSHOT_DIR=${SFT_HF_SNAPSHOT_DIR}
OPD_HF_SNAPSHOT_DIR=${OPD_HF_SNAPSHOT_DIR}
EVAL_OUTPUT_DIR=${EVAL_OUTPUT_DIR}
BASE_EVAL_OUTPUT_DIR=${BASE_EVAL_OUTPUT_DIR}
SFT_EVAL_OUTPUT_DIR=${SFT_EVAL_OUTPUT_DIR}
OPD_EVAL_OUTPUT_DIR=${OPD_EVAL_OUTPUT_DIR}
CHECKPOINT_REPORT_DIR=${CHECKPOINT_REPORT_DIR}
SLURM_LOG_DIR=${SLURM_LOG_DIR}

This script created directories only. It did not download data/models,
pull containers, install packages, submit jobs, delete files, or push commits.
EOF
}

create_directories
print_summary
114 changes: 114 additions & 0 deletions examples/qwen3_8b_opd_tillicum/02_prepare_data_25k_10k.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#!/bin/bash
#SBATCH --job-name=slime-qwen3-data25k10k
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --gres=gpu:h200:1
#SBATCH --mem=128G
#SBATCH --time=02:00:00
#SBATCH --mail-user=suryadv@cs.washington.edu
#SBATCH --mail-type=END,FAIL
#
# Prepare the OpenThoughts3 math split (SFT parquet + OPD jsonl + metadata)
# inside the slime container, then validate row counts and SFT/OPD disjointness.
#
# Configuration (SFT_PARQUET, OPD_JSONL, SPLIT_METADATA, SFT_SIZE, OPD_SIZE,
# SLURM_LOG_DIR, DATA_ROOT, ...) is expected to come from env.sh.
# NOTE(review): the inner script reads these variables from its environment, so
# this assumes env.sh exports them and container_exec.sh forwards them - confirm.
# NOTE(review): --mail-user above is a fixed address; other users should pass
# their own with `sbatch --mail-user=...`.

set -euo pipefail

# Prefer the repo checkout the job was submitted from; otherwise fall back to
# the directory of this file. (Presumably needed because BASH_SOURCE may not
# point into the repo when the script runs under Slurm - TODO confirm.)
if [[ -n "${SLURM_SUBMIT_DIR:-}" && -f "${SLURM_SUBMIT_DIR}/examples/qwen3_8b_opd_tillicum/env.sh" ]]; then
  SCRIPT_DIR="${SLURM_SUBMIT_DIR}/examples/qwen3_8b_opd_tillicum"
else
  SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
fi
source "${SCRIPT_DIR}/env.sh"

mkdir -p "${SLURM_LOG_DIR}" "${DATA_ROOT}"
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  # Per-job Ray temp dir under /tmp. Fall back to the real account name when
  # USER is unset instead of a hard-coded personal username.
  export RAY_TMPDIR="/tmp/${USER:-$(id -un)}/slime-ray-${SLURM_JOB_ID}"
  mkdir -p "${RAY_TMPDIR}"
  # From here on, all stdout/stderr of this job goes to the per-job log file.
  exec >"${SLURM_LOG_DIR}/${SLURM_JOB_NAME}_${SLURM_JOB_ID}.log" 2>&1
fi

echo "Preparing OpenThoughts3 math data at $(date)"
echo "SFT rows: ${SFT_SIZE} -> ${SFT_PARQUET}"
echo "OPD rows: ${OPD_SIZE} -> ${OPD_JSONL}"
echo "Metadata: ${SPLIT_METADATA}"

# The inner script is one single-quoted string: it must not contain any
# single-quote character, and its variables are expanded by the inner shell.
"${SCRIPT_DIR}/container_exec.sh" bash -lc '
set -euo pipefail
cd "${SLIME_REPO_ROOT}"

# Check row counts and SFT/OPD disjointness of the prepared split.
# Exits non-zero with a reason on stderr on mismatch; prints DATA_PREP_OK on
# success. The here-doc delimiter is quoted so the shell never expands
# anything inside the Python source.
validate_split() {
  python3 - <<"PY"
import json
import os
from pathlib import Path

sft_path = Path(os.environ["SFT_PARQUET"])
opd_path = Path(os.environ["OPD_JSONL"])
metadata_path = Path(os.environ["SPLIT_METADATA"])
sft_expected = int(os.environ["SFT_SIZE"])
opd_expected = int(os.environ["OPD_SIZE"])


def count_rows(path: Path) -> int:
    """Return the number of records stored in a parquet or line-based file."""
    # Parquet is a binary columnar format, so counting newline bytes says
    # nothing about its rows; read the row count from the file metadata.
    if path.suffix.lower() == ".parquet":
        import pyarrow.parquet as pq

        return pq.ParquetFile(str(path)).metadata.num_rows
    with path.open("rb") as f:
        return sum(1 for _ in f)


sft_count = count_rows(sft_path)
opd_count = count_rows(opd_path)
if sft_count != sft_expected:
    raise SystemExit(f"SFT row count mismatch: {sft_count} != {sft_expected}")
if opd_count != opd_expected:
    raise SystemExit(f"OPD row count mismatch: {opd_count} != {opd_expected}")

metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
sft_ids = set(metadata["source_row_ids"]["sft"])
opd_ids = set(metadata["source_row_ids"]["opd"])
overlap = sft_ids.intersection(opd_ids)
if overlap:
    raise SystemExit(f"SFT/OPD metadata overlap detected: {sorted(overlap)[:10]}")

print(f"DATA_PREP_OK sft_rows={sft_count} opd_rows={opd_count} metadata={metadata_path}")
PY
}

# Regenerate when any output is missing or empty ...
needs_prepare=0
for path in "${SFT_PARQUET}" "${OPD_JSONL}" "${SPLIT_METADATA}"; do
  if [[ ! -s "${path}" ]]; then
    needs_prepare=1
  fi
done

# ... or when the existing outputs do not match the requested split.
# stdout is dropped here so DATA_PREP_OK is only reported by the final check.
if [[ "${needs_prepare}" == "0" ]] && ! validate_split >/dev/null; then
  echo "Existing data files do not match the requested split; regenerating."
  needs_prepare=1
fi

if [[ "${needs_prepare}" == "1" ]]; then
  python3 examples/qwen3_8b_opd_tillicum/02_prepare_openthoughts3_math_sample.py --force
else
  echo "Data files already exist; validating counts."
fi

# Final gate: the job fails here if the split is still inconsistent.
validate_split
'

echo "Finished data prep at $(date)"
Loading
Loading