From e5c8baf736c3cf2a496a66410ee499d037f32738 Mon Sep 17 00:00:00 2001 From: anxi01 Date: Tue, 12 May 2026 20:05:48 +0900 Subject: [PATCH 01/13] ci: add EC2 deployment workflow Add a Docker-based production deployment path for the FastAPI API and Postgres on a single EC2 instance. Document required GitHub secrets and make the settings/test setup compatible with production compose environment values. --- .dockerignore | 11 ++++ .env.example | 19 ++++++ .github/workflows/deploy-ec2.yml | 108 +++++++++++++++++++++++++++++++ Dockerfile | 22 +++++++ README.md | 19 ++++++ app/core/config.py | 2 +- docker-compose.prod.yml | 49 ++++++++++++++ tests/test_public_api.py | 32 +++++---- 8 files changed, 248 insertions(+), 14 deletions(-) create mode 100644 .dockerignore create mode 100644 .env.example create mode 100644 .github/workflows/deploy-ec2.yml create mode 100644 Dockerfile create mode 100644 docker-compose.prod.yml diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..6598ca0 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,11 @@ +.git +.github +.pytest_cache +.venv +__pycache__ +*.py[cod] +*.egg-info +.env +linko-dev.db +dist +build diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..9d5a234 --- /dev/null +++ b/.env.example @@ -0,0 +1,19 @@ +DATABASE_URL= +POSTGRES_DB= +POSTGRES_USER= +POSTGRES_PASSWORD= +API_PORT= + +JWT_SECRET_KEY= +JWT_ALGORITHM= +JWT_ACCESS_TOKEN_EXPIRE_MINUTES= + +GOOGLE_CLIENT_ID= +GOOGLE_CLIENT_SECRET= +YOUTUBE_API_KEY= + +AI_PROVIDER= +GEMINI_API_KEY= +GEMINI_MODEL= + +CORS_ORIGINS= \ No newline at end of file diff --git a/.github/workflows/deploy-ec2.yml b/.github/workflows/deploy-ec2.yml new file mode 100644 index 0000000..8daf2e3 --- /dev/null +++ b/.github/workflows/deploy-ec2.yml @@ -0,0 +1,108 @@ +name: Deploy to EC2 + +on: + pull_request: + push: + branches: + - main + workflow_dispatch: + +concurrency: + group: deploy-ec2-${{ github.ref }} + cancel-in-progress: true + +jobs: + test: + name: 
Test + runs-on: ubuntu-latest + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.13" + cache: pip + + - name: Install dependencies + run: python -m pip install -e ".[dev]" + + - name: Run tests + run: python -m pytest -v + + deploy: + name: Deploy + runs-on: ubuntu-latest + needs: test + if: github.event_name == 'workflow_dispatch' || github.ref == 'refs/heads/main' + environment: production + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Configure SSH + env: + EC2_HOST: ${{ secrets.EC2_HOST }} + EC2_PORT: ${{ secrets.EC2_PORT }} + EC2_SSH_KEY: ${{ secrets.EC2_SSH_KEY }} + EC2_USER: ${{ secrets.EC2_USER }} + run: | + test -n "$EC2_HOST" + test -n "$EC2_SSH_KEY" + mkdir -p ~/.ssh + printf '%s\n' "$EC2_SSH_KEY" > ~/.ssh/linko-ec2 + chmod 600 ~/.ssh/linko-ec2 + ssh-keyscan -p "${EC2_PORT:-22}" "$EC2_HOST" >> ~/.ssh/known_hosts + { + echo "Host linko-ec2" + echo " HostName $EC2_HOST" + echo " Port ${EC2_PORT:-22}" + echo " User ${EC2_USER:-ubuntu}" + echo " IdentityFile ~/.ssh/linko-ec2" + echo " StrictHostKeyChecking yes" + } >> ~/.ssh/config + + - name: Package application + run: | + tar \ + --exclude='.git' \ + --exclude='.github' \ + --exclude='.pytest_cache' \ + --exclude='.venv' \ + --exclude='__pycache__' \ + --exclude='*.py[cod]' \ + --exclude='*.egg-info' \ + --exclude='.env' \ + -czf /tmp/linko-server.tar.gz . 
+ + - name: Upload application + env: + DEPLOY_PATH: ${{ secrets.EC2_DEPLOY_PATH }} + PROD_ENV: ${{ secrets.PROD_ENV }} + run: | + ssh linko-ec2 "mkdir -p '${DEPLOY_PATH:-/opt/linko-server}'" + scp /tmp/linko-server.tar.gz "linko-ec2:${DEPLOY_PATH:-/opt/linko-server}/release.tar.gz" + if [ -n "$PROD_ENV" ]; then + printf '%s\n' "$PROD_ENV" > /tmp/linko.env + scp /tmp/linko.env "linko-ec2:${DEPLOY_PATH:-/opt/linko-server}/.env" + fi + + - name: Restart services + env: + DEPLOY_PATH: ${{ secrets.EC2_DEPLOY_PATH }} + run: | + ssh linko-ec2 "DEPLOY_PATH='${DEPLOY_PATH:-/opt/linko-server}' bash -s" <<'REMOTE' + set -euo pipefail + cd "$DEPLOY_PATH" + tar -xzf release.tar.gz + rm release.tar.gz + + test -f .env + docker compose -f docker-compose.prod.yml up -d --build + docker compose -f docker-compose.prod.yml exec -T api python -m alembic upgrade head + docker compose -f docker-compose.prod.yml ps + docker image prune -f + REMOTE diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..bdb22f3 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,22 @@ +FROM python:3.13-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y --no-install-recommends gcc libpq-dev \ + && rm -rf /var/lib/apt/lists/* + +COPY pyproject.toml ./ +COPY alembic.ini ./ +COPY alembic ./alembic +COPY app ./app + +RUN python -m pip install --upgrade pip \ + && python -m pip install . + +EXPOSE 8000 + +CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/README.md b/README.md index 4fca7cb..b5bde54 100644 --- a/README.md +++ b/README.md @@ -66,3 +66,22 @@ Stop local services: ```sh docker compose down ``` + +## EC2 Deployment + +The production deployment uses GitHub Actions to copy the app to an EC2 instance +and restart Docker Compose there. Configure these GitHub environment secrets for +the `production` environment: + +- `EC2_HOST`: EC2 public host or IP address. 
+- `EC2_USER`: SSH user, for example `ubuntu`. +- `EC2_SSH_KEY`: private key with SSH access to the instance. +- `EC2_PORT`: optional SSH port, defaults to `22`. +- `EC2_DEPLOY_PATH`: optional deploy directory, defaults to `/opt/linko-server`. +- `PROD_ENV`: full contents of the production `.env` file. Use `.env.example` as + the template, and replace secrets before deploying. + +On the EC2 instance, install Docker and the Docker Compose plugin first. Then run +the `Deploy to EC2` workflow manually, or merge to `main` to deploy +automatically. The workflow builds the FastAPI image on EC2, starts the API and +PostgreSQL containers, and runs Alembic migrations. diff --git a/app/core/config.py b/app/core/config.py index 29a0462..5c8f6f4 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -21,7 +21,7 @@ class Settings(BaseSettings): "http://127.0.0.1:3001" ) - model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8") + model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore") @property def cors_origin_list(self) -> list[str]: diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml new file mode 100644 index 0000000..f679621 --- /dev/null +++ b/docker-compose.prod.yml @@ -0,0 +1,49 @@ +services: + api: + build: + context: . 
+ image: linko-server-api:latest + container_name: linko-api + restart: unless-stopped + env_file: + - .env + environment: + DATABASE_URL: ${DATABASE_URL:?Set DATABASE_URL in .env} + ports: + - "${API_PORT:-8000}:8000" + depends_on: + postgres: + condition: service_healthy + healthcheck: + test: + [ + "CMD-SHELL", + "python -c \"import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/api/health', timeout=5)\"", + ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 20s + + postgres: + image: postgres:16-alpine + container_name: linko-postgres + restart: unless-stopped + env_file: + - .env + environment: + POSTGRES_DB: ${POSTGRES_DB:-linko} + POSTGRES_USER: ${POSTGRES_USER:-linko} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?Set POSTGRES_PASSWORD in .env} + ports: + - "127.0.0.1:${POSTGRES_HOST_PORT:-5432}:5432" + volumes: + - postgres_data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-linko} -d ${POSTGRES_DB:-linko}"] + interval: 10s + timeout: 5s + retries: 5 + +volumes: + postgres_data: diff --git a/tests/test_public_api.py b/tests/test_public_api.py index 44fec58..7dea7a5 100644 --- a/tests/test_public_api.py +++ b/tests/test_public_api.py @@ -14,6 +14,7 @@ from fastapi.testclient import TestClient from sqlalchemy import create_engine from sqlalchemy.orm import Session, sessionmaker +from sqlalchemy.pool import StaticPool from app.db.base import Base from app.db.session import get_db @@ -28,8 +29,12 @@ TEST_DATABASE_URL = "sqlite:///:memory:" -engine = create_engine(TEST_DATABASE_URL, connect_args={"check_same_thread": False}) -TestingSessionLocal = sessionmaker(bind=engine) +engine = create_engine( + TEST_DATABASE_URL, + connect_args={"check_same_thread": False}, + poolclass=StaticPool, +) +TestingSessionLocal = sessionmaker(bind=engine, expire_on_commit=False) def override_get_db(): @@ -42,12 +47,11 @@ def override_get_db(): @pytest.fixture(autouse=True) def setup_db(): + 
app.dependency_overrides[get_db] = override_get_db Base.metadata.create_all(bind=engine) yield Base.metadata.drop_all(bind=engine) - - -app.dependency_overrides[get_db] = override_get_db + app.dependency_overrides.clear() client = TestClient(app) @@ -113,16 +117,18 @@ def setup_db(): } ] +_UNSET = object() + def _make_lesson( db: Session, *, is_preview: bool = True, generation_status: str = "ready", - flashcards_json: dict | None = None, - subtitles_json: dict | None = None, - watch_vocab_json: dict | None = None, - cultural_notes_json: list | None = None, + flashcards_json: dict | None | object = _UNSET, + subtitles_json: dict | None | object = _UNSET, + watch_vocab_json: dict | None | object = _UNSET, + cultural_notes_json: list | None | object = _UNSET, ) -> Lesson: lesson = Lesson( user_id=1, @@ -135,10 +141,10 @@ def _make_lesson( generation_status=generation_status, is_preview=is_preview, transcript_status="ready", - flashcards_json=flashcards_json or _FLASHCARDS_JSON, - subtitles_json=subtitles_json or _SUBTITLES_JSON, - watch_vocab_json=watch_vocab_json or _WATCH_VOCAB_JSON, - cultural_notes_json=cultural_notes_json or _CULTURAL_NOTES_JSON, + flashcards_json=_FLASHCARDS_JSON if flashcards_json is _UNSET else flashcards_json, + subtitles_json=_SUBTITLES_JSON if subtitles_json is _UNSET else subtitles_json, + watch_vocab_json=_WATCH_VOCAB_JSON if watch_vocab_json is _UNSET else watch_vocab_json, + cultural_notes_json=_CULTURAL_NOTES_JSON if cultural_notes_json is _UNSET else cultural_notes_json, raw_youtube_metadata={}, created_at=datetime.now(UTC), updated_at=datetime.now(UTC), From f9f3f1bf9fd6019749eca4fbe3b1170efc07bd04 Mon Sep 17 00:00:00 2001 From: anxi01 Date: Tue, 12 May 2026 20:25:32 +0900 Subject: [PATCH 02/13] ci: Pin EC2 host key in deployment workflow to prevent MITM attacks --- .github/workflows/deploy-ec2.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/deploy-ec2.yml b/.github/workflows/deploy-ec2.yml 
index 8daf2e3..f18f50a 100644 --- a/.github/workflows/deploy-ec2.yml +++ b/.github/workflows/deploy-ec2.yml @@ -49,13 +49,15 @@ jobs: EC2_PORT: ${{ secrets.EC2_PORT }} EC2_SSH_KEY: ${{ secrets.EC2_SSH_KEY }} EC2_USER: ${{ secrets.EC2_USER }} + EC2_HOST_KEY: ${{ secrets.EC2_HOST_KEY }} run: | test -n "$EC2_HOST" test -n "$EC2_SSH_KEY" + test -n "$EC2_HOST_KEY" mkdir -p ~/.ssh printf '%s\n' "$EC2_SSH_KEY" > ~/.ssh/linko-ec2 chmod 600 ~/.ssh/linko-ec2 - ssh-keyscan -p "${EC2_PORT:-22}" "$EC2_HOST" >> ~/.ssh/known_hosts + printf '%s\n' "$EC2_HOST_KEY" >> ~/.ssh/known_hosts { echo "Host linko-ec2" echo " HostName $EC2_HOST" From 63cb87eda134327f4df770176c44395265c03f66 Mon Sep 17 00:00:00 2001 From: anxi01 Date: Tue, 12 May 2026 20:30:19 +0900 Subject: [PATCH 03/13] Allow deployment from test branch --- .github/workflows/deploy-ec2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy-ec2.yml b/.github/workflows/deploy-ec2.yml index f18f50a..de77d74 100644 --- a/.github/workflows/deploy-ec2.yml +++ b/.github/workflows/deploy-ec2.yml @@ -36,7 +36,7 @@ jobs: name: Deploy runs-on: ubuntu-latest needs: test - if: github.event_name == 'workflow_dispatch' || github.ref == 'refs/heads/main' + if: github.event_name == 'workflow_dispatch' || github.ref == 'refs/heads/main' || github.ref == 'refs/heads/codex/flashcard-aws-deploy-workflow' environment: production steps: From 61bf74381a0928e692e89a30fbaa25a92c770362 Mon Sep 17 00:00:00 2001 From: anxi01 Date: Tue, 12 May 2026 20:31:53 +0900 Subject: [PATCH 04/13] Allow deployment on pull_request for testing --- .github/workflows/deploy-ec2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy-ec2.yml b/.github/workflows/deploy-ec2.yml index de77d74..af585e4 100644 --- a/.github/workflows/deploy-ec2.yml +++ b/.github/workflows/deploy-ec2.yml @@ -36,7 +36,7 @@ jobs: name: Deploy runs-on: ubuntu-latest needs: test - if: github.event_name == 
'workflow_dispatch' || github.ref == 'refs/heads/main' || github.ref == 'refs/heads/codex/flashcard-aws-deploy-workflow' + if: github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' || github.ref == 'refs/heads/main' environment: production steps: From 7abdd9579cad81409d89bfb3698af1ade92c9f83 Mon Sep 17 00:00:00 2001 From: anxi01 Date: Tue, 12 May 2026 23:47:55 +0900 Subject: [PATCH 05/13] feat: Replace YouTube transcript extraction with Supadata API --- .env.example | 1 + app/core/config.py | 1 + app/services/transcripts.py | 159 +++++++++++++----------------------- tests/test_transcripts.py | 73 +++++------------ 4 files changed, 83 insertions(+), 151 deletions(-) diff --git a/.env.example b/.env.example index 9d5a234..365cc45 100644 --- a/.env.example +++ b/.env.example @@ -15,5 +15,6 @@ YOUTUBE_API_KEY= AI_PROVIDER= GEMINI_API_KEY= GEMINI_MODEL= +SUPADATA_API_KEY= CORS_ORIGINS= \ No newline at end of file diff --git a/app/core/config.py b/app/core/config.py index 5c8f6f4..7412aa0 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -14,6 +14,7 @@ class Settings(BaseSettings): ai_provider: str = "mock" gemini_api_key: str = "" gemini_model: str = "gemini-2.5-flash" + supadata_api_key: str = "" cors_origins: str = ( "http://localhost:3000," "http://127.0.0.1:3000," diff --git a/app/services/transcripts.py b/app/services/transcripts.py index b0d0df3..cc9f70f 100644 --- a/app/services/transcripts.py +++ b/app/services/transcripts.py @@ -1,9 +1,12 @@ from dataclasses import dataclass from pathlib import Path -from typing import Callable, Literal +from typing import Literal import html -import re -import subprocess + +import httpx + +from app.core.config import get_settings +from app.services.youtube import extract_video_id @dataclass(frozen=True) @@ -20,68 +23,6 @@ class TranscriptResult: segments: list[TranscriptSegment] -CommandRunner = Callable[[list[str]], subprocess.CompletedProcess[str]] - -TIMESTAMP_RE = re.compile( - 
r"(?P<h>\d{2}):(?P<m>\d{2}):(?P<s>\d{2})[.,](?P<ms>\d{3})" -) - - -def run_command(args: list[str]) -> subprocess.CompletedProcess[str]: - return subprocess.run(args, capture_output=True, text=True, check=False) - - -def timestamp_to_seconds(value: str) -> float: - match = TIMESTAMP_RE.search(value) - if match is None: - raise ValueError(f"Invalid timestamp: {value}") - return ( - int(match.group("h")) * 3600 - + int(match.group("m")) * 60 - + int(match.group("s")) - + int(match.group("ms")) / 1000 - ) - - -def clean_caption_text(value: str) -> str: - value = re.sub(r"<[^>]+>", "", value) - value = html.unescape(value) - return re.sub(r"\s+", " ", value).strip() - - -def parse_vtt(path: Path) -> list[TranscriptSegment]: - segments: list[TranscriptSegment] = [] - current_range: tuple[float, float] | None = None - text_lines: list[str] = [] - - def flush() -> None: - nonlocal current_range, text_lines - if current_range and text_lines: - text = clean_caption_text(" ".join(text_lines)) - if text: - segments.append(TranscriptSegment(current_range[0], current_range[1], text)) - current_range = None - text_lines = [] - - for raw_line in path.read_text(encoding="utf-8", errors="replace").splitlines(): - line = raw_line.strip() - if not line: - flush() - continue - if "-->" in line: - flush() - start_raw, end_raw = [part.strip() for part in line.split("-->", 1)] - end_raw = end_raw.split(" ", 1)[0] - current_range = (timestamp_to_seconds(start_raw), timestamp_to_seconds(end_raw)) - continue - if line == "WEBVTT" or line.startswith(("Kind:", "Language:", "NOTE")): - continue - if current_range: - text_lines.append(line) - flush() - return segments - - def filter_segments( segments: list[TranscriptSegment], start_sec: float, @@ -96,48 +37,66 @@ def filter_segments( def download_youtube_captions( url: str, - output_dir: Path, + output_dir: Path, # Signature compatibility lang: str, start_sec: int, end_sec: int, allow_auto: bool = True, - runner: CommandRunner = run_command, + runner: any
= None, # Signature compatibility ) -> TranscriptResult | None: - output_dir.mkdir(parents=True, exist_ok=True) - output_template = str(output_dir / "captions.%(ext)s") - args = [ - "yt-dlp", - "--skip-download", - "--sub-lang", - lang, - "--write-sub", - "--sub-format", - "vtt", - "-o", - output_template, - url, - ] - if allow_auto: - args.insert(4, "--write-auto-sub") - - result = runner(args) - if result.returncode != 0: + settings = get_settings() + if not settings.supadata_api_key: return None - vtt_files = sorted(output_dir.glob("captions*.vtt")) - if not vtt_files: + try: + video_id = extract_video_id(url) + except Exception: return None - all_segments = parse_vtt(vtt_files[0]) - scoped = filter_segments(all_segments, start_sec=start_sec, end_sec=end_sec) - text = "\n".join(item.text for item in scoped) - if len(text.strip()) < 20: + try: + response = httpx.get( + "https://api.supadata.ai/v1/youtube/transcript", + params={"videoId": video_id, "lang": lang}, + headers={"x-api-key": settings.supadata_api_key}, + timeout=30 + ) + if response.status_code != 200: + return None + + data = response.json() + content = data.get("content", []) + + segments: list[TranscriptSegment] = [] + for item in content: + # offset and duration are in ms + start = item["offset"] / 1000.0 + duration = item["duration"] / 1000.0 + end = start + duration + + # Filter by time range + if end > start_sec and start < end_sec: + segments.append( + TranscriptSegment( + start_sec=start, + end_sec=end, + text=html.unescape(item["text"]), + ) + ) + + if not segments: + return None + + full_text = "\n".join(s.text for s in segments) + if len(full_text.strip()) < 20: + return None + + # Supadata defaults to the best available transcript. + # We'll label it as youtube_caption for consistency. 
+ return TranscriptResult( + source="youtube_caption", + text=full_text, + segments=segments + ) + + except Exception: return None - - source: Literal["youtube_caption", "youtube_auto_caption"] = ( - "youtube_caption" if ".auto." not in vtt_files[0].name else "youtube_auto_caption" - ) - if allow_auto: - source = "youtube_auto_caption" - - return TranscriptResult(source=source, text=text, segments=scoped) diff --git a/tests/test_transcripts.py b/tests/test_transcripts.py index 3abd562..10ec1d4 100644 --- a/tests/test_transcripts.py +++ b/tests/test_transcripts.py @@ -1,43 +1,11 @@ from pathlib import Path -import subprocess - +from unittest.mock import patch, MagicMock from app.services.transcripts import ( TranscriptSegment, - clean_caption_text, download_youtube_captions, filter_segments, - parse_vtt, ) - -def test_clean_caption_text_removes_vtt_markup(): - assert clean_caption_text("안녕<00:00:01.000>하세요& 반가워요") == "안녕하세요& 반가워요" - - -def test_parse_vtt_reads_caption_segments(tmp_path: Path): - path = tmp_path / "captions.ko.vtt" - path.write_text( - """WEBVTT -Kind: captions -Language: ko - -00:00:00.000 --> 00:00:02.500 -안녕하세요. - -00:00:02.500 --> 00:00:05.000 -오늘은 한국어를 공부해요. -""", - encoding="utf-8", - ) - - segments = parse_vtt(path) - - assert segments == [ - TranscriptSegment(start_sec=0.0, end_sec=2.5, text="안녕하세요."), - TranscriptSegment(start_sec=2.5, end_sec=5.0, text="오늘은 한국어를 공부해요."), - ] - - def test_filter_segments_keeps_overlapping_segments(): segments = [ TranscriptSegment(start_sec=0, end_sec=2, text="one"), @@ -49,21 +17,23 @@ def test_filter_segments_keeps_overlapping_segments(): assert filter_segments(segments, start_sec=2.1, end_sec=3.9) == [segments[1]] -def test_download_youtube_captions_uses_runner_and_parses_vtt(tmp_path: Path): - def runner(args: list[str]) -> subprocess.CompletedProcess[str]: - assert args[0] == "yt-dlp" - (tmp_path / "captions.ko.vtt").write_text( - """WEBVTT - -00:00:00.000 --> 00:00:05.000 -안녕하세요. 
오늘은 서울의 길거리 음식을 함께 즐겨볼게요. - -00:00:05.000 --> 00:00:10.000 -이 시장은 현지인도 자주 와서 맛있는 음식 가게로 가득해요. -""", - encoding="utf-8", - ) - return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") +@patch("app.services.transcripts.httpx.get") +@patch("app.services.transcripts.get_settings") +def test_download_youtube_captions_success(mock_get_settings, mock_get, tmp_path: Path): + mock_settings = MagicMock() + mock_settings.supadata_api_key = "test-key" + mock_get_settings.return_value = mock_settings + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "content": [ + {"text": "안녕하세요.", "offset": 0, "duration": 5000, "lang": "ko"}, + {"text": "서울의 길거리 음식입니다.", "offset": 5000, "duration": 5000, "lang": "ko"}, + ], + "lang": "ko" + } + mock_get.return_value = mock_response transcript = download_youtube_captions( "https://youtu.be/abc123XYZ00", @@ -71,11 +41,12 @@ def runner(args: list[str]) -> subprocess.CompletedProcess[str]: lang="ko", start_sec=0, end_sec=10, - allow_auto=True, - runner=runner, + allow_auto=True ) assert transcript is not None - assert transcript.source == "youtube_auto_caption" + assert transcript.source == "youtube_caption" assert "길거리 음식" in transcript.text assert len(transcript.segments) == 2 + assert transcript.segments[0].start_sec == 0.0 + assert transcript.segments[1].end_sec == 10.0 From 2d3c0a422ede3b5774b7e938595a601a5facea9a Mon Sep 17 00:00:00 2001 From: anxi01 Date: Tue, 12 May 2026 23:56:21 +0900 Subject: [PATCH 06/13] chore: Increase Gemini API timeout to 600s --- app/services/lesson_artifacts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/services/lesson_artifacts.py b/app/services/lesson_artifacts.py index 5ab3b65..f7ecba3 100644 --- a/app/services/lesson_artifacts.py +++ b/app/services/lesson_artifacts.py @@ -189,7 +189,7 @@ def _call_gemini( "contents": [{"role": "user", "parts": [{"text": prompt}]}], "generationConfig": 
{"responseMimeType": "application/json"}, }, - timeout=300, + timeout=600, ) response.raise_for_status() data = response.json() From 8cb0addd778de666c5440bde2150736449b8830e Mon Sep 17 00:00:00 2001 From: anxi01 Date: Wed, 13 May 2026 00:26:44 +0900 Subject: [PATCH 07/13] feat: store bilingual subtitles before flashcard generation --- app/api/flashcards.py | 8 ++ app/api/lessons.py | 62 +++++++--- app/api/public.py | 8 ++ app/services/lesson_artifacts.py | 202 ++++++++++++++++++++----------- app/services/transcripts.py | 9 +- tests/test_lesson_artifacts.py | 51 +++++++- tests/test_lessons_api.py | 105 +++++++++++++++- tests/test_transcripts.py | 35 ++++++ 8 files changed, 382 insertions(+), 98 deletions(-) diff --git a/app/api/flashcards.py b/app/api/flashcards.py index 75b51d8..9feeb8c 100644 --- a/app/api/flashcards.py +++ b/app/api/flashcards.py @@ -38,6 +38,14 @@ def get_flashcards_for_lesson( ) if lesson.flashcards_json is not None: return lesson.flashcards_json + if lesson.error_code == "flashcard_generation_failed": + raise HTTPException( + status_code=422, + detail={ + "code": "flashcard_generation_failed", + "message": lesson.error_message or "Flashcard generation failed.", + }, + ) flashcards = get_lesson_flashcards(lesson_id) if flashcards is None: diff --git a/app/api/lessons.py b/app/api/lessons.py index ac1b777..08b1f8e 100644 --- a/app/api/lessons.py +++ b/app/api/lessons.py @@ -16,7 +16,10 @@ LessonStatusResponse, LessonSummary, ) -from app.services.lesson_artifacts import generate_lesson_artifacts_from_transcript +from app.services.lesson_artifacts import ( + build_subtitle_artifacts, + generate_lesson_artifacts_from_transcript, +) from app.services.transcripts import download_youtube_captions from app.services.youtube import ( extract_video_id, @@ -39,8 +42,8 @@ def _lesson_summary(lesson: Lesson) -> LessonSummary: duration=format_duration(lesson.duration_seconds), date=lesson.created_at.strftime("%Y.%m.%d") if lesson.created_at else None, 
generationStatus=lesson.generation_status, - flashcardDone=False, - subtitleDone=False, + flashcardDone=lesson.flashcards_json is not None, + subtitleDone=lesson.subtitles_json is not None, errorCode=lesson.error_code, errorMessage=lesson.error_message, ) @@ -190,14 +193,21 @@ def generate_lesson_artifacts_task(lesson_id: int) -> None: if lesson is None: return - end_sec = min(lesson.duration_seconds, 600) with TemporaryDirectory() as tmp_dir: transcript = download_youtube_captions( lesson.youtube_url, Path(tmp_dir), lang="ko", start_sec=0, - end_sec=end_sec, + end_sec=lesson.duration_seconds, + allow_auto=True, + ) + english_transcript = download_youtube_captions( + lesson.youtube_url, + Path(tmp_dir), + lang="en", + start_sec=0, + end_sec=lesson.duration_seconds, allow_auto=True, ) @@ -211,13 +221,6 @@ def generate_lesson_artifacts_task(lesson_id: int) -> None: db.commit() return - artifacts = generate_lesson_artifacts_from_transcript( - lesson_id=str(lesson.id), - lesson_title=lesson.title, - youtube_id=lesson.youtube_video_id, - duration_seconds=lesson.duration_seconds, - transcript=transcript, - ) lesson.transcript_status = "ready" lesson.transcript_source = transcript.source lesson.transcript_text = transcript.text @@ -229,13 +232,36 @@ def generate_lesson_artifacts_task(lesson_id: int) -> None: } for segment in transcript.segments ] - lesson.flashcards_json = artifacts.flashcards - lesson.subtitles_json = artifacts.subtitles - lesson.watch_vocab_json = artifacts.watch_vocab - lesson.cultural_notes_json = artifacts.cultural_notes + lesson.subtitles_json = build_subtitle_artifacts( + youtube_id=lesson.youtube_video_id, + duration_seconds=lesson.duration_seconds, + transcript=transcript, + english_transcript=english_transcript, + ) + lesson.watch_vocab_json = {} + lesson.cultural_notes_json = [] + db.commit() + + try: + artifacts = generate_lesson_artifacts_from_transcript( + lesson_id=str(lesson.id), + lesson_title=lesson.title, + 
youtube_id=lesson.youtube_video_id, + duration_seconds=lesson.duration_seconds, + transcript=transcript, + english_transcript=english_transcript, + ) + lesson.flashcards_json = artifacts.flashcards + lesson.watch_vocab_json = artifacts.watch_vocab + lesson.cultural_notes_json = artifacts.cultural_notes + lesson.error_code = None + lesson.error_message = None + except Exception as exc: + lesson.flashcards_json = None + lesson.error_code = "flashcard_generation_failed" + lesson.error_message = str(exc) + lesson.generation_status = "ready" - lesson.error_code = None - lesson.error_message = None db.commit() except Exception as exc: db.rollback() diff --git a/app/api/public.py b/app/api/public.py index 0f34ca5..001cf70 100644 --- a/app/api/public.py +++ b/app/api/public.py @@ -107,6 +107,14 @@ def get_preview_flashcards( """Return flashcard data for a preview lesson without authentication.""" lesson = _get_ready_preview_lesson(db, lesson_id) if lesson.flashcards_json is None: + if lesson.error_code == "flashcard_generation_failed": + raise HTTPException( + status_code=422, + detail={ + "code": "flashcard_generation_failed", + "message": lesson.error_message or "Flashcard generation failed.", + }, + ) raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail={ diff --git a/app/services/lesson_artifacts.py b/app/services/lesson_artifacts.py index f7ecba3..82cbf3e 100644 --- a/app/services/lesson_artifacts.py +++ b/app/services/lesson_artifacts.py @@ -3,7 +3,11 @@ import json from app.core.config import get_settings -from app.services.transcripts import TranscriptResult +from app.services.transcripts import TranscriptResult, TranscriptSegment + + +FLASHCARD_TRANSCRIPT_MAX_SECONDS = 300 +FLASHCARD_TRANSCRIPT_MAX_CHARS = 12000 class ArtifactValidationError(ValueError): @@ -24,7 +28,15 @@ def generate_lesson_artifacts_from_transcript( youtube_id: str, duration_seconds: int, transcript: TranscriptResult, + english_transcript: TranscriptResult | None = None, ) -> 
LessonArtifacts: + subtitles = build_subtitle_artifacts( + youtube_id=youtube_id, + duration_seconds=duration_seconds, + transcript=transcript, + english_transcript=english_transcript, + ) + flashcard_transcript = limit_transcript_for_flashcards(transcript) settings = get_settings() if settings.ai_provider == "gemini" and settings.gemini_api_key: payload = _call_gemini( @@ -32,70 +44,152 @@ def generate_lesson_artifacts_from_transcript( lesson_title=lesson_title, youtube_id=youtube_id, duration_seconds=duration_seconds, - transcript=transcript, + transcript=flashcard_transcript, ) else: - payload = _mock_artifacts( + payload = _mock_flashcards( lesson_id=lesson_id, lesson_title=lesson_title, youtube_id=youtube_id, duration_seconds=duration_seconds, - transcript=transcript, + transcript=flashcard_transcript, ) - return validate_lesson_artifacts(payload) + flashcards = validate_flashcard_artifacts(payload) + return LessonArtifacts( + flashcards=flashcards, + subtitles=subtitles, + watch_vocab={}, + cultural_notes=[], + ) + + +def build_subtitle_artifacts( + youtube_id: str, + duration_seconds: int, + transcript: TranscriptResult, + english_transcript: TranscriptResult | None = None, +) -> dict[str, Any]: + return { + "youtubeId": youtube_id, + "durationSec": duration_seconds, + "lines": [ + { + "id": f"s{index}", + "startSec": segment.start_sec, + "endSec": segment.end_sec, + "korean": segment.text, + "english": _matching_english_text(segment, english_transcript), + } + for index, segment in enumerate(transcript.segments, start=1) + ], + } + + +def _matching_english_text( + korean_segment: TranscriptSegment, + english_transcript: TranscriptResult | None, +) -> str: + if english_transcript is None: + return "" + + matches = [ + segment.text + for segment in english_transcript.segments + if segment.end_sec > korean_segment.start_sec + and segment.start_sec < korean_segment.end_sec + ] + return " ".join(matches) + + +def limit_transcript_for_flashcards( + transcript: 
TranscriptResult, + max_seconds: int = FLASHCARD_TRANSCRIPT_MAX_SECONDS, + max_chars: int = FLASHCARD_TRANSCRIPT_MAX_CHARS, +) -> TranscriptResult: + segments: list[TranscriptSegment] = [] + used_chars = 0 + + for segment in transcript.segments: + if segment.start_sec >= max_seconds: + break + + remaining_chars = max_chars - used_chars + if remaining_chars <= 0: + break + + text = segment.text[:remaining_chars].rstrip() + if not text: + break + + segments.append( + TranscriptSegment( + start_sec=segment.start_sec, + end_sec=min(segment.end_sec, max_seconds), + text=text, + ) + ) + used_chars += len(text) + + if segment.end_sec >= max_seconds or used_chars >= max_chars: + break + + return TranscriptResult( + source=transcript.source, + text="\n".join(segment.text for segment in segments), + segments=segments, + ) def validate_lesson_artifacts(payload: dict[str, Any]) -> LessonArtifacts: + subtitles = payload.get("subtitles") + if not isinstance(subtitles, dict): + raise ArtifactValidationError("subtitles must be an object") + if not isinstance(subtitles.get("lines"), list): + raise ArtifactValidationError("subtitles.lines must be a list") + + return LessonArtifacts( + flashcards=validate_flashcard_artifacts(payload), + subtitles={ + "youtubeId": subtitles.get("youtubeId"), + "durationSec": subtitles.get("durationSec"), + "lines": subtitles["lines"], + }, + watch_vocab=_validate_watch_vocab(subtitles), + cultural_notes=_validate_cultural_notes(subtitles), + ) + + +def validate_flashcard_artifacts(payload: dict[str, Any]) -> dict[str, Any]: flashcards = payload.get("flashcards") if not isinstance(flashcards, dict): raise ArtifactValidationError("flashcards must be an object") if not isinstance(flashcards.get("cards"), list): raise ArtifactValidationError("flashcards.cards must be a list") + return flashcards - subtitles = payload.get("subtitles") - if not isinstance(subtitles, dict): - raise ArtifactValidationError("subtitles must be an object") - if not 
isinstance(subtitles.get("lines"), list): - raise ArtifactValidationError("subtitles.lines must be a list") +def _validate_watch_vocab(subtitles: dict[str, Any]) -> dict[str, Any]: watch_vocab = subtitles.get("vocabMap", {}) if not isinstance(watch_vocab, dict): raise ArtifactValidationError("subtitles.vocabMap must be an object") + return watch_vocab + +def _validate_cultural_notes(subtitles: dict[str, Any]) -> list[dict[str, Any]]: cultural_notes = subtitles.get("culturalNotes", []) if not isinstance(cultural_notes, list): raise ArtifactValidationError("subtitles.culturalNotes must be a list") - - return LessonArtifacts( - flashcards=flashcards, - subtitles={ - "youtubeId": subtitles.get("youtubeId"), - "durationSec": subtitles.get("durationSec"), - "lines": subtitles["lines"], - }, - watch_vocab=watch_vocab, - cultural_notes=cultural_notes, - ) + return cultural_notes -def _mock_artifacts( +def _mock_flashcards( lesson_id: str, lesson_title: str, youtube_id: str, duration_seconds: int, transcript: TranscriptResult, ) -> dict[str, Any]: - lines = [ - { - "id": f"s{index}", - "startSec": int(segment.start_sec), - "endSec": int(segment.end_sec), - "korean": segment.text, - "english": f"English translation for: {segment.text}", - } - for index, segment in enumerate(transcript.segments, start=1) - ] first_segment = transcript.segments[0] if transcript.segments else None start_sec = int(first_segment.start_sec) if first_segment else 0 end_sec = int(first_segment.end_sec) if first_segment else min(duration_seconds, 5) @@ -128,30 +222,6 @@ def _mock_artifacts( } ], }, - "subtitles": { - "youtubeId": youtube_id, - "durationSec": duration_seconds, - "lines": lines, - "vocabMap": { - expression: { - "meaning": f"Meaning of {expression}", - "cardId": card_id, - "lessonId": lesson_id, - "expression": expression, - "exampleSentence": first_text, - "exampleTranslation": f"English translation for: {first_text}", - } - }, - "culturalNotes": [ - { - "id": 
f"culture-{lesson_id}-1", - "subtitleId": "s1", - "title": expression, - "keyword": "Context", - "explanation": "This note is generated from the transcript context.", - } - ], - }, } @@ -265,22 +335,16 @@ def _build_gemini_prompt( Required top-level shape: {{ - "flashcards": {{"lessonId": "{lesson_id}", "lessonTitle": "{lesson_title}", "cards": []}}, - "subtitles": {{ - "youtubeId": "{youtube_id}", - "durationSec": {duration_seconds}, - "lines": [], - "vocabMap": {{}}, - "culturalNotes": [] - }} + "flashcards": {{"lessonId": "{lesson_id}", "lessonTitle": "{lesson_title}", "cards": []}} }} Rules: -- flashcards.cards must contain 5 to 10 cards when the transcript has enough material. +- You are receiving a deliberately shortened transcript excerpt, capped to roughly the first {FLASHCARD_TRANSCRIPT_MAX_SECONDS // 60} minutes and {FLASHCARD_TRANSCRIPT_MAX_CHARS} characters to keep the request small and reliable. +- Create flashcards only from this excerpt. Do not try to cover the whole video. +- flashcards.cards must contain 5 to 10 cards when the excerpt has enough material. - Include BOTH word cards and useful ending cards. - Use ONLY the timestamped transcript segments below for all startSec/endSec values. - For every flashcard video, startSec/endSec MUST match the transcript segment that contains the exampleSentence or scriptSentence. Do not invent timestamps. -- For subtitles.lines, preserve the transcript segment timing exactly unless adjacent segments must be merged for readability. If merging, use the first segment startSec and last segment endSec. - YOU MUST format EACH card EXACTLY according to these structures: Structure for Word card (type="word"): @@ -318,11 +382,7 @@ def _build_gemini_prompt( "relatedVideos": [] }} -- subtitles.lines must contain Korean and English lines with startSec and endSec. 
Example: {{"id": "s1", "startSec": 0, "endSec": 5, "korean": "...", "english": "..."}} -- vocabMap keys must be surface forms that appear in subtitle Korean text. -- culturalNotes should explain slang, idioms, cultural context, or grammar patterns. - Transcript source: {transcript.source} Timestamped transcript segments: -{timestamped_segments[:18000]} +{timestamped_segments} """.strip() diff --git a/app/services/transcripts.py b/app/services/transcripts.py index cc9f70f..2919f5c 100644 --- a/app/services/transcripts.py +++ b/app/services/transcripts.py @@ -21,6 +21,7 @@ class TranscriptResult: source: Literal["youtube_caption", "youtube_auto_caption"] text: str segments: list[TranscriptSegment] + lang: str | None = None def filter_segments( @@ -43,6 +44,7 @@ def download_youtube_captions( end_sec: int, allow_auto: bool = True, runner: any = None, # Signature compatibility + require_requested_lang: bool = True, ) -> TranscriptResult | None: settings = get_settings() if not settings.supadata_api_key: @@ -64,6 +66,10 @@ def download_youtube_captions( return None data = response.json() + actual_lang = data.get("lang") + if require_requested_lang and actual_lang != lang: + return None + content = data.get("content", []) segments: list[TranscriptSegment] = [] @@ -95,7 +101,8 @@ def download_youtube_captions( return TranscriptResult( source="youtube_caption", text=full_text, - segments=segments + segments=segments, + lang=actual_lang, ) except Exception: diff --git a/tests/test_lesson_artifacts.py b/tests/test_lesson_artifacts.py index 0613ca0..56e6a0a 100644 --- a/tests/test_lesson_artifacts.py +++ b/tests/test_lesson_artifacts.py @@ -1,8 +1,12 @@ import pytest +from app.core.config import get_settings from app.services.lesson_artifacts import ( ArtifactValidationError, + FLASHCARD_TRANSCRIPT_MAX_CHARS, + build_subtitle_artifacts, generate_lesson_artifacts_from_transcript, + limit_transcript_for_flashcards, _parse_gemini_json, validate_lesson_artifacts, ) @@ -33,6 +37,7 
@@ def sample_transcript() -> TranscriptResult: def test_generate_lesson_artifacts_returns_frontend_contract(monkeypatch: pytest.MonkeyPatch): monkeypatch.setenv("AI_PROVIDER", "mock") + get_settings.cache_clear() artifacts = generate_lesson_artifacts_from_transcript( lesson_id="42", @@ -52,8 +57,50 @@ def test_generate_lesson_artifacts_returns_frontend_contract(monkeypatch: pytest } assert artifacts.subtitles["youtubeId"] == "abc123XYZ00" assert artifacts.subtitles["lines"][0]["korean"].startswith("안녕하세요") - assert "안녕하세요" in artifacts.watch_vocab - assert artifacts.cultural_notes[0]["subtitleId"] == "s1" + assert artifacts.subtitles["lines"][0]["english"] == "" + assert artifacts.watch_vocab == {} + assert artifacts.cultural_notes == [] + get_settings.cache_clear() + + +def test_build_subtitle_artifacts_merges_overlapping_english_segments(): + english_transcript = TranscriptResult( + source="youtube_caption", + text="Hello. Today we study Korean.", + lang="en", + segments=[ + TranscriptSegment(start_sec=0, end_sec=2, text="Hello."), + TranscriptSegment(start_sec=2, end_sec=5, text="Today we study Korean."), + ], + ) + + subtitles = build_subtitle_artifacts( + youtube_id="abc123XYZ00", + duration_seconds=10, + transcript=sample_transcript(), + english_transcript=english_transcript, + ) + + assert subtitles["lines"][0]["english"] == "Hello. Today we study Korean." 
+ assert subtitles["lines"][1]["english"] == "" + + +def test_flashcard_transcript_is_limited_to_five_minutes_and_safe_character_count(): + transcript = TranscriptResult( + source="youtube_caption", + text="", + segments=[ + TranscriptSegment(start_sec=0, end_sec=120, text="가" * 6000), + TranscriptSegment(start_sec=120, end_sec=240, text="나" * 6000), + TranscriptSegment(start_sec=240, end_sec=360, text="다" * 6000), + ], + ) + + limited = limit_transcript_for_flashcards(transcript) + + assert limited.segments[-1].end_sec <= 300 + assert len(limited.text) <= FLASHCARD_TRANSCRIPT_MAX_CHARS + len(limited.segments) + assert all(segment.start_sec < 300 for segment in limited.segments) def test_validate_lesson_artifacts_rejects_missing_required_shapes(): diff --git a/tests/test_lessons_api.py b/tests/test_lessons_api.py index 80ebe91..926bb7f 100644 --- a/tests/test_lessons_api.py +++ b/tests/test_lessons_api.py @@ -8,6 +8,7 @@ import app.api.lessons as lessons_api from app.api.auth import get_google_user +from app.core.config import get_settings from app.db.base import Base from app.db.session import enable_sqlite_foreign_keys, get_db from app.main import app @@ -189,6 +190,8 @@ def test_lesson_artifact_endpoints_return_status_specific_errors(client: TestCli def test_background_task_generates_and_stores_artifacts(client: TestClient, monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("AI_PROVIDER", "mock") + get_settings.cache_clear() headers = auth_headers(client) db = next(app.dependency_overrides[get_db]()) try: @@ -212,12 +215,24 @@ def test_background_task_generates_and_stores_artifacts(client: TestClient, monk finally: db.close() - monkeypatch.setattr( - lessons_api, - "download_youtube_captions", - lambda *args, **kwargs: TranscriptResult( + def fake_download(*args, **kwargs): + if kwargs["lang"] == "en": + return TranscriptResult( + source="youtube_caption", + text="Hello. 
Today we study Korean.", + lang="en", + segments=[ + TranscriptSegment( + start_sec=0, + end_sec=5, + text="Hello. Today we study Korean.", + ) + ], + ) + return TranscriptResult( source="youtube_caption", text="안녕하세요. 오늘은 한국어를 공부해요.", + lang="ko", segments=[ TranscriptSegment( start_sec=0, @@ -225,8 +240,9 @@ def test_background_task_generates_and_stores_artifacts(client: TestClient, monk text="안녕하세요. 오늘은 한국어를 공부해요.", ) ], - ), - ) + ) + + monkeypatch.setattr(lessons_api, "download_youtube_captions", fake_download) lessons_api.generate_lesson_artifacts_task(lesson_id) @@ -237,3 +253,80 @@ def test_background_task_generates_and_stores_artifacts(client: TestClient, monk response = client.get(f"/api/lessons/{lesson_id}/subtitles", headers=headers) assert response.status_code == 200 assert response.json()["youtubeId"] == "abc123XYZ00" + assert response.json()["lines"][0]["english"] == "Hello. Today we study Korean." + get_settings.cache_clear() + + +def test_background_task_keeps_watch_ready_when_flashcard_generation_fails( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +): + headers = auth_headers(client) + db = next(app.dependency_overrides[get_db]()) + try: + user_id = db.scalar(select(User.id).where(User.email == "lessons@example.com")) + lesson = Lesson( + user_id=user_id, + youtube_url="https://youtu.be/abc123XYZ00", + youtube_video_id="abc123XYZ00", + title="Subtitle Only Lesson", + channel_title="Channel", + thumbnail_url=None, + duration_seconds=900, + generation_status="generating", + transcript_status="pending", + raw_youtube_metadata={}, + ) + db.add(lesson) + db.commit() + db.refresh(lesson) + lesson_id = lesson.id + finally: + db.close() + + requested_ranges: list[tuple[int, int]] = [] + + def fake_download(*args, **kwargs): + requested_ranges.append((kwargs["start_sec"], kwargs["end_sec"])) + if kwargs["lang"] == "en": + return None + return TranscriptResult( + source="youtube_caption", + text="안녕하세요. 
오늘은 한국어를 공부해요.", + segments=[ + TranscriptSegment( + start_sec=0, + end_sec=5, + text="안녕하세요. 오늘은 한국어를 공부해요.", + ) + ], + ) + + monkeypatch.setattr(lessons_api, "download_youtube_captions", fake_download) + monkeypatch.setattr( + lessons_api, + "generate_lesson_artifacts_from_transcript", + lambda *args, **kwargs: (_ for _ in ()).throw(RuntimeError("Gemini unavailable")), + ) + + lessons_api.generate_lesson_artifacts_task(lesson_id) + + assert requested_ranges == [(0, 900), (0, 900)] + + response = client.get(f"/api/lessons/{lesson_id}", headers=headers) + assert response.status_code == 200 + assert response.json()["generationStatus"] == "ready" + assert response.json()["subtitleDone"] is True + assert response.json()["flashcardDone"] is False + assert response.json()["errorCode"] == "flashcard_generation_failed" + + response = client.get(f"/api/lessons/{lesson_id}/subtitles", headers=headers) + assert response.status_code == 200 + body = response.json() + assert body["youtubeId"] == "abc123XYZ00" + assert body["lines"][0]["korean"].startswith("안녕하세요") + assert body["lines"][0]["english"] == "" + + response = client.get(f"/api/lessons/{lesson_id}/flashcards", headers=headers) + assert response.status_code == 422 + assert response.json()["detail"]["code"] == "flashcard_generation_failed" diff --git a/tests/test_transcripts.py b/tests/test_transcripts.py index 10ec1d4..c30c0b5 100644 --- a/tests/test_transcripts.py +++ b/tests/test_transcripts.py @@ -50,3 +50,38 @@ def test_download_youtube_captions_success(mock_get_settings, mock_get, tmp_path assert len(transcript.segments) == 2 assert transcript.segments[0].start_sec == 0.0 assert transcript.segments[1].end_sec == 10.0 + assert transcript.lang == "ko" + + +@patch("app.services.transcripts.httpx.get") +@patch("app.services.transcripts.get_settings") +def test_download_youtube_captions_rejects_supadata_language_fallback( + mock_get_settings, + mock_get, + tmp_path: Path, +): + mock_settings = MagicMock() + 
mock_settings.supadata_api_key = "test-key" + mock_get_settings.return_value = mock_settings + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "content": [ + {"text": "안녕하세요.", "offset": 0, "duration": 5000, "lang": "ko"}, + ], + "lang": "ko", + "availableLangs": ["ko"], + } + mock_get.return_value = mock_response + + transcript = download_youtube_captions( + "https://youtu.be/abc123XYZ00", + tmp_path, + lang="en", + start_sec=0, + end_sec=10, + allow_auto=True, + ) + + assert transcript is None From 83080f7e1812531adf08d1c27795a9085931578d Mon Sep 17 00:00:00 2001 From: anxi01 Date: Wed, 13 May 2026 00:57:37 +0900 Subject: [PATCH 08/13] feat: enrich watch mode during flashcard generation --- app/api/lessons.py | 3 + app/api/public.py | 1 + app/services/lesson_artifacts.py | 142 +++++++++++++++++++++++++++++-- tests/test_lesson_artifacts.py | 34 +++++++- tests/test_lessons_api.py | 40 +++++++++ tests/test_public_api.py | 9 ++ 6 files changed, 219 insertions(+), 10 deletions(-) diff --git a/app/api/lessons.py b/app/api/lessons.py index 08b1f8e..6a8120b 100644 --- a/app/api/lessons.py +++ b/app/api/lessons.py @@ -28,6 +28,7 @@ parse_iso8601_duration_seconds, parse_published_at, select_thumbnail_url, + validate_video_item, ) router = APIRouter(prefix="/lessons", tags=["lessons"]) @@ -75,6 +76,7 @@ def create_lesson( ) from exc item = fetch_youtube_video_item(youtube_video_id) + validate_video_item(item) snippet = item["snippet"] duration_seconds = parse_iso8601_duration_seconds(item["contentDetails"]["duration"]) lesson = Lesson( @@ -161,6 +163,7 @@ def get_lesson_subtitles( ) return { **lesson.subtitles_json, + "youtubeId": lesson.subtitles_json.get("youtubeId") or lesson.youtube_video_id, "vocabMap": lesson.watch_vocab_json or {}, "culturalNotes": lesson.cultural_notes_json or [], } diff --git a/app/api/public.py b/app/api/public.py index 001cf70..447b1d5 100644 --- a/app/api/public.py +++ b/app/api/public.py 
@@ -147,6 +147,7 @@ def get_preview_subtitles( ) return { **lesson.subtitles_json, + "youtubeId": lesson.subtitles_json.get("youtubeId") or lesson.youtube_video_id, "vocabMap": lesson.watch_vocab_json or {}, "culturalNotes": lesson.cultural_notes_json or [], } diff --git a/app/services/lesson_artifacts.py b/app/services/lesson_artifacts.py index 82cbf3e..cc308d0 100644 --- a/app/services/lesson_artifacts.py +++ b/app/services/lesson_artifacts.py @@ -1,6 +1,7 @@ from dataclasses import dataclass from typing import Any import json +import random from app.core.config import get_settings from app.services.transcripts import TranscriptResult, TranscriptSegment @@ -36,7 +37,10 @@ def generate_lesson_artifacts_from_transcript( transcript=transcript, english_transcript=english_transcript, ) - flashcard_transcript = limit_transcript_for_flashcards(transcript) + flashcard_transcript = sample_transcript_for_flashcards( + transcript=transcript, + seed=f"{lesson_id}:{youtube_id}", + ) settings = get_settings() if settings.ai_provider == "gemini" and settings.gemini_api_key: payload = _call_gemini( @@ -56,14 +60,21 @@ def generate_lesson_artifacts_from_transcript( ) flashcards = validate_flashcard_artifacts(payload) + watch_enrichments = validate_watch_enrichments(payload) return LessonArtifacts( flashcards=flashcards, subtitles=subtitles, - watch_vocab={}, - cultural_notes=[], + watch_vocab=watch_enrichments.watch_vocab, + cultural_notes=watch_enrichments.cultural_notes, ) +@dataclass(frozen=True) +class WatchEnrichments: + watch_vocab: dict[str, Any] + cultural_notes: list[dict[str, Any]] + + def build_subtitle_artifacts( youtube_id: str, duration_seconds: int, @@ -141,6 +152,73 @@ def limit_transcript_for_flashcards( ) +def sample_transcript_for_flashcards( + transcript: TranscriptResult, + seed: str, + max_seconds: int = FLASHCARD_TRANSCRIPT_MAX_SECONDS, + max_chars: int = FLASHCARD_TRANSCRIPT_MAX_CHARS, +) -> TranscriptResult: + if not transcript.segments: + return 
TranscriptResult(source=transcript.source, text="", segments=[], lang=transcript.lang) + + total_start = transcript.segments[0].start_sec + total_end = max(segment.end_sec for segment in transcript.segments) + if total_end - total_start <= max_seconds: + return limit_transcript_for_flashcards(transcript, max_seconds=max_seconds, max_chars=max_chars) + + rng = random.Random(seed) + window_count = 3 if total_end - total_start > 900 else 2 + window_seconds = max_seconds / window_count + bucket_seconds = (total_end - total_start) / window_count + starts = [] + for index in range(window_count): + bucket_start = total_start + (index * bucket_seconds) + bucket_end = total_start + ((index + 1) * bucket_seconds) + latest_start = max(bucket_start, bucket_end - window_seconds) + starts.append(rng.uniform(bucket_start, latest_start)) + + selected: list[TranscriptSegment] = [] + used_chars = 0 + selected_keys: set[tuple[float, float, str]] = set() + + for start in starts: + end = min(start + window_seconds, total_end) + for segment in transcript.segments: + if segment.end_sec <= start or segment.start_sec >= end: + continue + remaining_chars = max_chars - used_chars + if remaining_chars <= 0: + break + + text = segment.text[:remaining_chars].rstrip() + if not text: + break + + clipped = TranscriptSegment( + start_sec=max(segment.start_sec, start), + end_sec=min(segment.end_sec, end), + text=text, + ) + key = (clipped.start_sec, clipped.end_sec, clipped.text) + if key in selected_keys: + continue + + selected.append(clipped) + selected_keys.add(key) + used_chars += len(text) + + if used_chars >= max_chars: + break + + selected.sort(key=lambda segment: (segment.start_sec, segment.end_sec)) + return TranscriptResult( + source=transcript.source, + text="\n".join(segment.text for segment in selected), + segments=selected, + lang=transcript.lang, + ) + + def validate_lesson_artifacts(payload: dict[str, Any]) -> LessonArtifacts: subtitles = payload.get("subtitles") if not 
isinstance(subtitles, dict): @@ -169,6 +247,27 @@ def validate_flashcard_artifacts(payload: dict[str, Any]) -> dict[str, Any]: return flashcards +def validate_watch_enrichments(payload: dict[str, Any]) -> WatchEnrichments: + watch = payload.get("watch", {}) + if watch is None: + watch = {} + if not isinstance(watch, dict): + raise ArtifactValidationError("watch must be an object") + + watch_vocab = watch.get("vocabMap", {}) + if not isinstance(watch_vocab, dict): + raise ArtifactValidationError("watch.vocabMap must be an object") + + cultural_notes = watch.get("culturalNotes", []) + if not isinstance(cultural_notes, list): + raise ArtifactValidationError("watch.culturalNotes must be a list") + + return WatchEnrichments( + watch_vocab=watch_vocab, + cultural_notes=cultural_notes, + ) + + def _validate_watch_vocab(subtitles: dict[str, Any]) -> dict[str, Any]: watch_vocab = subtitles.get("vocabMap", {}) if not isinstance(watch_vocab, dict): @@ -222,6 +321,27 @@ def _mock_flashcards( } ], }, + "watch": { + "vocabMap": { + expression: { + "meaning": f"Meaning of {expression}", + "cardId": card_id, + "lessonId": lesson_id, + "expression": expression, + "exampleSentence": first_text, + "exampleTranslation": f"English translation for: {first_text}", + } + }, + "culturalNotes": [ + { + "id": f"culture-{lesson_id}-1", + "subtitleId": "s1", + "title": expression, + "keyword": "Context", + "explanation": "This note is generated from the transcript context.", + } + ], + }, } @@ -325,8 +445,8 @@ def _build_gemini_prompt( transcript: TranscriptResult, ) -> str: timestamped_segments = "\n".join( - f"[{int(segment.start_sec)}-{int(segment.end_sec)}] {segment.text}" - for segment in transcript.segments + f"[s{index} {int(segment.start_sec)}-{int(segment.end_sec)}] {segment.text}" + for index, segment in enumerate(transcript.segments, start=1) ) return f""" @@ -335,16 +455,22 @@ def _build_gemini_prompt( Required top-level shape: {{ - "flashcards": {{"lessonId": "{lesson_id}", 
"lessonTitle": "{lesson_title}", "cards": []}} + "flashcards": {{"lessonId": "{lesson_id}", "lessonTitle": "{lesson_title}", "cards": []}}, + "watch": {{"vocabMap": {{}}, "culturalNotes": []}} }} Rules: -- You are receiving a deliberately shortened transcript excerpt, capped to roughly the first {FLASHCARD_TRANSCRIPT_MAX_SECONDS // 60} minutes and {FLASHCARD_TRANSCRIPT_MAX_CHARS} characters to keep the request small and reliable. -- Create flashcards only from this excerpt. Do not try to cover the whole video. +- You are receiving a deterministic sample of continuous transcript excerpts, capped to roughly {FLASHCARD_TRANSCRIPT_MAX_SECONDS // 60} total minutes and {FLASHCARD_TRANSCRIPT_MAX_CHARS} characters to keep the request small and reliable. +- Create flashcards and watch enrichments only from these sampled excerpts. Do not try to cover the whole video. - flashcards.cards must contain 5 to 10 cards when the excerpt has enough material. - Include BOTH word cards and useful ending cards. - Use ONLY the timestamped transcript segments below for all startSec/endSec values. - For every flashcard video, startSec/endSec MUST match the transcript segment that contains the exampleSentence or scriptSentence. Do not invent timestamps. +- watch.vocabMap powers hidden vocabulary labels in the Watch UI. Keys MUST be Korean surface forms that appear verbatim in the sampled transcript. +- When a watch.vocabMap entry corresponds to a flashcard, set cardId to that flashcard id. +- watch.vocabMap values MUST include meaning, lessonId, expression, exampleSentence, and exampleTranslation. +- watch.culturalNotes should contain 2 to 5 notes for slang, idioms, cultural context, or grammar patterns found in the sampled excerpts. +- watch.culturalNotes subtitleId MUST reference one of the sampled subtitle ids shown below, such as s1, s2, s3. 
- YOU MUST format EACH card EXACTLY according to these structures: Structure for Word card (type="word"): diff --git a/tests/test_lesson_artifacts.py b/tests/test_lesson_artifacts.py index 56e6a0a..64dba8d 100644 --- a/tests/test_lesson_artifacts.py +++ b/tests/test_lesson_artifacts.py @@ -7,8 +7,10 @@ build_subtitle_artifacts, generate_lesson_artifacts_from_transcript, limit_transcript_for_flashcards, + sample_transcript_for_flashcards, _parse_gemini_json, validate_lesson_artifacts, + validate_watch_enrichments, ) from app.services.transcripts import TranscriptResult, TranscriptSegment @@ -58,8 +60,9 @@ def test_generate_lesson_artifacts_returns_frontend_contract(monkeypatch: pytest assert artifacts.subtitles["youtubeId"] == "abc123XYZ00" assert artifacts.subtitles["lines"][0]["korean"].startswith("안녕하세요") assert artifacts.subtitles["lines"][0]["english"] == "" - assert artifacts.watch_vocab == {} - assert artifacts.cultural_notes == [] + assert "안녕하세요" in artifacts.watch_vocab + assert artifacts.watch_vocab["안녕하세요"]["cardId"] == "fc-42-1" + assert artifacts.cultural_notes[0]["subtitleId"] == "s1" get_settings.cache_clear() @@ -103,6 +106,25 @@ def test_flashcard_transcript_is_limited_to_five_minutes_and_safe_character_coun assert all(segment.start_sec < 300 for segment in limited.segments) +def test_flashcard_transcript_sampling_is_deterministic_and_not_always_the_start(): + transcript = TranscriptResult( + source="youtube_caption", + text="", + segments=[ + TranscriptSegment(start_sec=i * 30, end_sec=(i + 1) * 30, text=f"구간{i}") + for i in range(40) + ], + ) + + first = sample_transcript_for_flashcards(transcript, seed="lesson:abc") + second = sample_transcript_for_flashcards(transcript, seed="lesson:abc") + + assert first.segments == second.segments + assert first.segments[0].start_sec > 0 + covered_seconds = sum(segment.end_sec - segment.start_sec for segment in first.segments) + assert covered_seconds <= 300 + + def 
test_validate_lesson_artifacts_rejects_missing_required_shapes(): with pytest.raises(ArtifactValidationError, match="flashcards.cards"): validate_lesson_artifacts( @@ -121,6 +143,14 @@ def test_validate_lesson_artifacts_rejects_missing_required_shapes(): ) +def test_validate_watch_enrichments_rejects_bad_shapes(): + with pytest.raises(ArtifactValidationError, match="watch.vocabMap"): + validate_watch_enrichments({"watch": {"vocabMap": []}}) + + with pytest.raises(ArtifactValidationError, match="watch.culturalNotes"): + validate_watch_enrichments({"watch": {"culturalNotes": {}}}) + + def test_parse_gemini_json_repairs_trailing_commas(): payload = """ { diff --git a/tests/test_lessons_api.py b/tests/test_lessons_api.py index 926bb7f..bdeed8c 100644 --- a/tests/test_lessons_api.py +++ b/tests/test_lessons_api.py @@ -140,6 +140,44 @@ def test_ready_lesson_flashcards_and_subtitles_are_returned(client: TestClient): assert response.json()["youtubeId"] == "ready123" +def test_lesson_subtitles_fall_back_to_lesson_youtube_id_when_artifact_omits_it( + client: TestClient, +): + headers = auth_headers(client) + db = next(app.dependency_overrides[get_db]()) + try: + user_id = db.scalar(select(User.id).where(User.email == "lessons@example.com")) + lesson = Lesson( + user_id=user_id, + youtube_url="https://youtu.be/fallback123", + youtube_video_id="fallback123", + title="Fallback Video Lesson", + channel_title="Channel", + thumbnail_url=None, + duration_seconds=60, + generation_status="ready", + transcript_status="ready", + transcript_source="youtube_caption", + transcript_text="안녕하세요.", + flashcards_json={"lessonId": "1", "lessonTitle": "Fallback Video Lesson", "cards": []}, + subtitles_json={"durationSec": 60, "lines": []}, + watch_vocab_json={}, + cultural_notes_json=[], + raw_youtube_metadata={}, + ) + db.add(lesson) + db.commit() + db.refresh(lesson) + lesson_id = lesson.id + finally: + db.close() + + response = client.get(f"/api/lessons/{lesson_id}/subtitles", 
headers=headers) + + assert response.status_code == 200 + assert response.json()["youtubeId"] == "fallback123" + + def test_lesson_artifact_endpoints_return_status_specific_errors(client: TestClient): headers = auth_headers(client) db = next(app.dependency_overrides[get_db]()) @@ -254,6 +292,8 @@ def fake_download(*args, **kwargs): assert response.status_code == 200 assert response.json()["youtubeId"] == "abc123XYZ00" assert response.json()["lines"][0]["english"] == "Hello. Today we study Korean." + assert "안녕하세요" in response.json()["vocabMap"] + assert response.json()["culturalNotes"][0]["subtitleId"] == "s1" get_settings.cache_clear() diff --git a/tests/test_public_api.py b/tests/test_public_api.py index 7dea7a5..8fc3541 100644 --- a/tests/test_public_api.py +++ b/tests/test_public_api.py @@ -278,6 +278,15 @@ def test_get_preview_subtitles_schema(): assert isinstance(data["culturalNotes"], list) +def test_get_preview_subtitles_falls_back_to_lesson_youtube_id(): + with TestingSessionLocal() as db: + lesson = _make_lesson(db, subtitles_json={"durationSec": 60, "lines": []}) + + data = client.get(f"{BASE}/lessons/{lesson.id}/subtitles").json() + + assert data["youtubeId"] == lesson.youtube_video_id + + def test_get_preview_subtitles_lines_shape(): with TestingSessionLocal() as db: lesson = _make_lesson(db) From 3c1f1fdd42de44f27d918f3099bab715a75e546c Mon Sep 17 00:00:00 2001 From: anxi01 Date: Wed, 13 May 2026 01:06:39 +0900 Subject: [PATCH 09/13] fix: reduce Gemini artifact request size --- app/services/lesson_artifacts.py | 20 ++++++++++---------- tests/test_lesson_artifacts.py | 8 ++++---- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/app/services/lesson_artifacts.py b/app/services/lesson_artifacts.py index cc308d0..10dc1f4 100644 --- a/app/services/lesson_artifacts.py +++ b/app/services/lesson_artifacts.py @@ -7,8 +7,8 @@ from app.services.transcripts import TranscriptResult, TranscriptSegment -FLASHCARD_TRANSCRIPT_MAX_SECONDS = 300 
-FLASHCARD_TRANSCRIPT_MAX_CHARS = 12000 +FLASHCARD_TRANSCRIPT_MAX_SECONDS = 180 +FLASHCARD_TRANSCRIPT_MAX_CHARS = 6000 class ArtifactValidationError(ValueError): @@ -167,7 +167,7 @@ def sample_transcript_for_flashcards( return limit_transcript_for_flashcards(transcript, max_seconds=max_seconds, max_chars=max_chars) rng = random.Random(seed) - window_count = 3 if total_end - total_start > 900 else 2 + window_count = 2 window_seconds = max_seconds / window_count bucket_seconds = (total_end - total_start) / window_count starts = [] @@ -462,18 +462,18 @@ def _build_gemini_prompt( Rules: - You are receiving a deterministic sample of continuous transcript excerpts, capped to roughly {FLASHCARD_TRANSCRIPT_MAX_SECONDS // 60} total minutes and {FLASHCARD_TRANSCRIPT_MAX_CHARS} characters to keep the request small and reliable. - Create flashcards and watch enrichments only from these sampled excerpts. Do not try to cover the whole video. -- flashcards.cards must contain 5 to 10 cards when the excerpt has enough material. -- Include BOTH word cards and useful ending cards. +- flashcards.cards must contain 3 to 5 cards when the excerpt has enough material. +- Prefer word cards. Include at most 1 ending card only if a useful grammar pattern is obvious. - Use ONLY the timestamped transcript segments below for all startSec/endSec values. - For every flashcard video, startSec/endSec MUST match the transcript segment that contains the exampleSentence or scriptSentence. Do not invent timestamps. - watch.vocabMap powers hidden vocabulary labels in the Watch UI. Keys MUST be Korean surface forms that appear verbatim in the sampled transcript. -- When a watch.vocabMap entry corresponds to a flashcard, set cardId to that flashcard id. +- watch.vocabMap must contain at most 5 entries. When an entry corresponds to a flashcard, set cardId to that flashcard id. - watch.vocabMap values MUST include meaning, lessonId, expression, exampleSentence, and exampleTranslation. 
-- watch.culturalNotes should contain 2 to 5 notes for slang, idioms, cultural context, or grammar patterns found in the sampled excerpts. +- watch.culturalNotes should contain 0 to 2 notes for slang, idioms, cultural context, or grammar patterns found in the sampled excerpts. - watch.culturalNotes subtitleId MUST reference one of the sampled subtitle ids shown below, such as s1, s2, s3. -- YOU MUST format EACH card EXACTLY according to these structures: +- Use these compact card shapes: -Structure for Word card (type="word"): +Word card: {{ "id": "fc-{lesson_id}-word-1", "type": "word", @@ -489,7 +489,7 @@ def _build_gemini_prompt( ] }} -Structure for Ending card (type="ending"): +Ending card: {{ "id": "fc-{lesson_id}-ending-1", "type": "ending", diff --git a/tests/test_lesson_artifacts.py b/tests/test_lesson_artifacts.py index 64dba8d..c1a7a72 100644 --- a/tests/test_lesson_artifacts.py +++ b/tests/test_lesson_artifacts.py @@ -88,7 +88,7 @@ def test_build_subtitle_artifacts_merges_overlapping_english_segments(): assert subtitles["lines"][1]["english"] == "" -def test_flashcard_transcript_is_limited_to_five_minutes_and_safe_character_count(): +def test_flashcard_transcript_is_limited_to_safe_duration_and_character_count(): transcript = TranscriptResult( source="youtube_caption", text="", @@ -101,9 +101,9 @@ def test_flashcard_transcript_is_limited_to_five_minutes_and_safe_character_coun limited = limit_transcript_for_flashcards(transcript) - assert limited.segments[-1].end_sec <= 300 + assert limited.segments[-1].end_sec <= 180 assert len(limited.text) <= FLASHCARD_TRANSCRIPT_MAX_CHARS + len(limited.segments) - assert all(segment.start_sec < 300 for segment in limited.segments) + assert all(segment.start_sec < 180 for segment in limited.segments) def test_flashcard_transcript_sampling_is_deterministic_and_not_always_the_start(): @@ -122,7 +122,7 @@ def test_flashcard_transcript_sampling_is_deterministic_and_not_always_the_start assert first.segments == 
second.segments assert first.segments[0].start_sec > 0 covered_seconds = sum(segment.end_sec - segment.start_sec for segment in first.segments) - assert covered_seconds <= 300 + assert covered_seconds <= 180 def test_validate_lesson_artifacts_rejects_missing_required_shapes(): From 0f483f22481ee50ae35881ab2bb166393aaf2ed6 Mon Sep 17 00:00:00 2001 From: anxi01 Date: Wed, 13 May 2026 20:42:00 +0900 Subject: [PATCH 10/13] ci: disable EC2 deploy workflow on PRs --- .github/workflows/deploy-ec2.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/deploy-ec2.yml b/.github/workflows/deploy-ec2.yml index af585e4..a422216 100644 --- a/.github/workflows/deploy-ec2.yml +++ b/.github/workflows/deploy-ec2.yml @@ -1,7 +1,6 @@ name: Deploy to EC2 on: - pull_request: push: branches: - main @@ -36,7 +35,7 @@ jobs: name: Deploy runs-on: ubuntu-latest needs: test - if: github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' || github.ref == 'refs/heads/main' + if: github.event_name == 'workflow_dispatch' || github.ref == 'refs/heads/main' environment: production steps: From 07e0b82299d89d18ecca467b9e38902e4e660e4d Mon Sep 17 00:00:00 2001 From: anxi01 Date: Wed, 13 May 2026 21:26:46 +0900 Subject: [PATCH 11/13] ci: keep PR tests without deployment --- .github/workflows/deploy-ec2.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/deploy-ec2.yml b/.github/workflows/deploy-ec2.yml index a422216..f18f50a 100644 --- a/.github/workflows/deploy-ec2.yml +++ b/.github/workflows/deploy-ec2.yml @@ -1,6 +1,7 @@ name: Deploy to EC2 on: + pull_request: push: branches: - main From 61d2b12e1f6756d678b5fb43fc204f8b21feb17a Mon Sep 17 00:00:00 2001 From: anxi01 Date: Wed, 13 May 2026 21:32:39 +0900 Subject: [PATCH 12/13] fix: fallback when transcript sampling is empty --- app/services/lesson_artifacts.py | 8 ++++++++ tests/test_lesson_artifacts.py | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git 
a/app/services/lesson_artifacts.py b/app/services/lesson_artifacts.py index 10dc1f4..efb3ad1 100644 --- a/app/services/lesson_artifacts.py +++ b/app/services/lesson_artifacts.py @@ -149,6 +149,7 @@ def limit_transcript_for_flashcards( source=transcript.source, text="\n".join(segment.text for segment in segments), segments=segments, + lang=transcript.lang, ) @@ -210,6 +211,13 @@ def sample_transcript_for_flashcards( if used_chars >= max_chars: break + if not selected: + return limit_transcript_for_flashcards( + transcript, + max_seconds=max_seconds, + max_chars=max_chars, + ) + selected.sort(key=lambda segment: (segment.start_sec, segment.end_sec)) return TranscriptResult( source=transcript.source, diff --git a/tests/test_lesson_artifacts.py b/tests/test_lesson_artifacts.py index c1a7a72..495625e 100644 --- a/tests/test_lesson_artifacts.py +++ b/tests/test_lesson_artifacts.py @@ -125,6 +125,26 @@ def test_flashcard_transcript_sampling_is_deterministic_and_not_always_the_start assert covered_seconds <= 180 +def test_flashcard_transcript_sampling_falls_back_when_windows_are_empty(): + transcript = TranscriptResult( + source="youtube_caption", + text="", + segments=[ + TranscriptSegment(start_sec=0, end_sec=10, text="첫 실제 발화"), + TranscriptSegment(start_sec=1000, end_sec=1010, text="마지막 실제 발화"), + ], + lang="ko", + ) + + sampled = sample_transcript_for_flashcards(transcript, seed="lesson:abc") + + assert sampled.segments == [ + TranscriptSegment(start_sec=0, end_sec=10, text="첫 실제 발화") + ] + assert sampled.text == "첫 실제 발화" + assert sampled.lang == "ko" + + def test_validate_lesson_artifacts_rejects_missing_required_shapes(): with pytest.raises(ArtifactValidationError, match="flashcards.cards"): validate_lesson_artifacts( From 5029e04e414d8baa5ceaf24dd357bba467b1444b Mon Sep 17 00:00:00 2001 From: anxi01 Date: Wed, 13 May 2026 21:34:57 +0900 Subject: [PATCH 13/13] fix: accept regional transcript language codes --- app/services/transcripts.py | 13 +++++++++++- 
tests/test_transcripts.py | 40 +++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/app/services/transcripts.py b/app/services/transcripts.py index 2919f5c..3849bdb 100644 --- a/app/services/transcripts.py +++ b/app/services/transcripts.py @@ -36,6 +36,17 @@ def filter_segments( ] +def language_matches(requested_lang: str, actual_lang: str | None) -> bool: + if not actual_lang: + return False + + return _base_language_code(requested_lang) == _base_language_code(actual_lang) + + +def _base_language_code(lang: str) -> str: + return lang.strip().lower().replace("_", "-").split("-", maxsplit=1)[0] + + def download_youtube_captions( url: str, output_dir: Path, # Signature compatibility @@ -67,7 +78,7 @@ def download_youtube_captions( data = response.json() actual_lang = data.get("lang") - if require_requested_lang and actual_lang != lang: + if require_requested_lang and not language_matches(lang, actual_lang): return None content = data.get("content", []) diff --git a/tests/test_transcripts.py b/tests/test_transcripts.py index c30c0b5..29a1d0d 100644 --- a/tests/test_transcripts.py +++ b/tests/test_transcripts.py @@ -53,6 +53,46 @@ def test_download_youtube_captions_success(mock_get_settings, mock_get, tmp_path assert transcript.lang == "ko" +@patch("app.services.transcripts.httpx.get") +@patch("app.services.transcripts.get_settings") +def test_download_youtube_captions_accepts_regional_language_variant( + mock_get_settings, + mock_get, + tmp_path: Path, +): + mock_settings = MagicMock() + mock_settings.supadata_api_key = "test-key" + mock_get_settings.return_value = mock_settings + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "content": [ + { + "text": "안녕하세요. 
오늘은 한국어 자막을 공부합니다.", + "offset": 0, + "duration": 5000, + "lang": "ko-KR", + }, + ], + "lang": "ko-KR", + } + mock_get.return_value = mock_response + + transcript = download_youtube_captions( + "https://youtu.be/abc123XYZ00", + tmp_path, + lang="ko", + start_sec=0, + end_sec=10, + allow_auto=True, + ) + + assert transcript is not None + assert transcript.lang == "ko-KR" + assert "한국어 자막" in transcript.text + + @patch("app.services.transcripts.httpx.get") @patch("app.services.transcripts.get_settings") def test_download_youtube_captions_rejects_supadata_language_fallback(