diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..6598ca0 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,11 @@ +.git +.github +.pytest_cache +.venv +__pycache__ +*.py[cod] +*.egg-info +.env +linko-dev.db +dist +build diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..365cc45 --- /dev/null +++ b/.env.example @@ -0,0 +1,20 @@ +DATABASE_URL= +POSTGRES_DB= +POSTGRES_USER= +POSTGRES_PASSWORD= +API_PORT= + +JWT_SECRET_KEY= +JWT_ALGORITHM= +JWT_ACCESS_TOKEN_EXPIRE_MINUTES= + +GOOGLE_CLIENT_ID= +GOOGLE_CLIENT_SECRET= +YOUTUBE_API_KEY= + +AI_PROVIDER= +GEMINI_API_KEY= +GEMINI_MODEL= +SUPADATA_API_KEY= + +CORS_ORIGINS= \ No newline at end of file diff --git a/.github/workflows/deploy-ec2.yml b/.github/workflows/deploy-ec2.yml new file mode 100644 index 0000000..f18f50a --- /dev/null +++ b/.github/workflows/deploy-ec2.yml @@ -0,0 +1,110 @@ +name: Deploy to EC2 + +on: + pull_request: + push: + branches: + - main + workflow_dispatch: + +concurrency: + group: deploy-ec2-${{ github.ref }} + cancel-in-progress: true + +jobs: + test: + name: Test + runs-on: ubuntu-latest + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.13" + cache: pip + + - name: Install dependencies + run: python -m pip install -e ".[dev]" + + - name: Run tests + run: python -m pytest -v + + deploy: + name: Deploy + runs-on: ubuntu-latest + needs: test + if: github.event_name == 'workflow_dispatch' || github.ref == 'refs/heads/main' + environment: production + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Configure SSH + env: + EC2_HOST: ${{ secrets.EC2_HOST }} + EC2_PORT: ${{ secrets.EC2_PORT }} + EC2_SSH_KEY: ${{ secrets.EC2_SSH_KEY }} + EC2_USER: ${{ secrets.EC2_USER }} + EC2_HOST_KEY: ${{ secrets.EC2_HOST_KEY }} + run: | + test -n "$EC2_HOST" + test -n "$EC2_SSH_KEY" + test -n "$EC2_HOST_KEY" + mkdir -p ~/.ssh + printf '%s\n' "$EC2_SSH_KEY" > ~/.ssh/linko-ec2 + chmod 600 ~/.ssh/linko-ec2 + printf '%s\n' "$EC2_HOST_KEY" >> ~/.ssh/known_hosts + { + echo "Host linko-ec2" + echo " HostName $EC2_HOST" + echo " Port ${EC2_PORT:-22}" + echo " User ${EC2_USER:-ubuntu}" + echo " IdentityFile ~/.ssh/linko-ec2" + echo " StrictHostKeyChecking yes" + } >> ~/.ssh/config + + - name: Package application + run: | + tar \ + --exclude='.git' \ + --exclude='.github' \ + --exclude='.pytest_cache' \ + --exclude='.venv' \ + --exclude='__pycache__' \ + --exclude='*.py[cod]' \ + --exclude='*.egg-info' \ + --exclude='.env' \ + -czf /tmp/linko-server.tar.gz . + + - name: Upload application + env: + DEPLOY_PATH: ${{ secrets.EC2_DEPLOY_PATH }} + PROD_ENV: ${{ secrets.PROD_ENV }} + run: | + ssh linko-ec2 "mkdir -p '${DEPLOY_PATH:-/opt/linko-server}'" + scp /tmp/linko-server.tar.gz "linko-ec2:${DEPLOY_PATH:-/opt/linko-server}/release.tar.gz" + if [ -n "$PROD_ENV" ]; then + printf '%s\n' "$PROD_ENV" > /tmp/linko.env + scp /tmp/linko.env "linko-ec2:${DEPLOY_PATH:-/opt/linko-server}/.env" + fi + + - name: Restart services + env: + DEPLOY_PATH: ${{ secrets.EC2_DEPLOY_PATH }} + run: | + ssh linko-ec2 "DEPLOY_PATH='${DEPLOY_PATH:-/opt/linko-server}' bash -s" <<'REMOTE' + set -euo pipefail + cd "$DEPLOY_PATH" + tar -xzf release.tar.gz + rm release.tar.gz + + test -f .env + docker compose -f docker-compose.prod.yml up -d --build + docker compose -f docker-compose.prod.yml exec -T api python -m alembic upgrade head + docker compose -f docker-compose.prod.yml ps + docker image prune -f + REMOTE diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..bdb22f3 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,22 @@ +FROM python:3.13-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y --no-install-recommends gcc libpq-dev \ + && rm -rf /var/lib/apt/lists/* + +COPY pyproject.toml ./ +COPY alembic.ini ./ +COPY alembic ./alembic +COPY app ./app + +RUN python -m pip install --upgrade pip \ + && python -m pip install . + +EXPOSE 8000 + +CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/README.md b/README.md index 4fca7cb..b5bde54 100644 --- a/README.md +++ b/README.md @@ -66,3 +66,22 @@ Stop local services: ```sh docker compose down ``` + +## EC2 Deployment + +The production deployment uses GitHub Actions to copy the app to an EC2 instance +and restart Docker Compose there. Configure these GitHub environment secrets for +the `production` environment: + +- `EC2_HOST`: EC2 public host or IP address. +- `EC2_USER`: SSH user, for example `ubuntu`. +- `EC2_SSH_KEY`: private key with SSH access to the instance. +- `EC2_PORT`: optional SSH port, defaults to `22`. +- `EC2_DEPLOY_PATH`: optional deploy directory, defaults to `/opt/linko-server`. +- `PROD_ENV`: full contents of the production `.env` file. Use `.env.example` as + the template, and replace secrets before deploying. + +On the EC2 instance, install Docker and the Docker Compose plugin first. Then run +the `Deploy to EC2` workflow manually, or merge to `main` to deploy +automatically. The workflow builds the FastAPI image on EC2, starts the API and +PostgreSQL containers, and runs Alembic migrations. diff --git a/app/api/flashcards.py b/app/api/flashcards.py index 75b51d8..9feeb8c 100644 --- a/app/api/flashcards.py +++ b/app/api/flashcards.py @@ -38,6 +38,14 @@ def get_flashcards_for_lesson( ) if lesson.flashcards_json is not None: return lesson.flashcards_json + if lesson.error_code == "flashcard_generation_failed": + raise HTTPException( + status_code=422, + detail={ + "code": "flashcard_generation_failed", + "message": lesson.error_message or "Flashcard generation failed.", + }, + ) flashcards = get_lesson_flashcards(lesson_id) if flashcards is None: diff --git a/app/api/lessons.py b/app/api/lessons.py index ac1b777..6a8120b 100644 --- a/app/api/lessons.py +++ b/app/api/lessons.py @@ -16,7 +16,10 @@ LessonStatusResponse, LessonSummary, ) -from app.services.lesson_artifacts import generate_lesson_artifacts_from_transcript +from app.services.lesson_artifacts import ( + build_subtitle_artifacts, + generate_lesson_artifacts_from_transcript, +) from app.services.transcripts import download_youtube_captions from app.services.youtube import ( extract_video_id, @@ -25,6 +28,7 @@ parse_iso8601_duration_seconds, parse_published_at, select_thumbnail_url, + validate_video_item, ) router = APIRouter(prefix="/lessons", tags=["lessons"]) @@ -39,8 +43,8 @@ def _lesson_summary(lesson: Lesson) -> LessonSummary: duration=format_duration(lesson.duration_seconds), date=lesson.created_at.strftime("%Y.%m.%d") if lesson.created_at else None, generationStatus=lesson.generation_status, - flashcardDone=False, - subtitleDone=False, + flashcardDone=lesson.flashcards_json is not None, + subtitleDone=lesson.subtitles_json is not None, errorCode=lesson.error_code, errorMessage=lesson.error_message, ) @@ -72,6 +76,7 @@ def create_lesson( ) from exc item = fetch_youtube_video_item(youtube_video_id) + validate_video_item(item) snippet = item["snippet"] duration_seconds = parse_iso8601_duration_seconds(item["contentDetails"]["duration"]) lesson = Lesson( @@ -158,6 +163,7 @@ def get_lesson_subtitles( ) return { **lesson.subtitles_json, + "youtubeId": lesson.subtitles_json.get("youtubeId") or lesson.youtube_video_id, "vocabMap": lesson.watch_vocab_json or {}, "culturalNotes": lesson.cultural_notes_json or [], } @@ -190,14 +196,21 @@ def generate_lesson_artifacts_task(lesson_id: int) -> None: if lesson is None: return - end_sec = min(lesson.duration_seconds, 600) with TemporaryDirectory() as tmp_dir: transcript = download_youtube_captions( lesson.youtube_url, Path(tmp_dir), lang="ko", start_sec=0, - end_sec=end_sec, + end_sec=lesson.duration_seconds, + allow_auto=True, + ) + english_transcript = download_youtube_captions( + lesson.youtube_url, + Path(tmp_dir), + lang="en", + start_sec=0, + end_sec=lesson.duration_seconds, allow_auto=True, ) @@ -211,13 +224,6 @@ def generate_lesson_artifacts_task(lesson_id: int) -> None: db.commit() return - artifacts = generate_lesson_artifacts_from_transcript( - lesson_id=str(lesson.id), - lesson_title=lesson.title, - youtube_id=lesson.youtube_video_id, - duration_seconds=lesson.duration_seconds, - transcript=transcript, - ) lesson.transcript_status = "ready" lesson.transcript_source = transcript.source lesson.transcript_text = transcript.text @@ -229,13 +235,36 @@ def generate_lesson_artifacts_task(lesson_id: int) -> None: } for segment in transcript.segments ] - lesson.flashcards_json = artifacts.flashcards - lesson.subtitles_json = artifacts.subtitles - lesson.watch_vocab_json = artifacts.watch_vocab - lesson.cultural_notes_json = artifacts.cultural_notes + lesson.subtitles_json = build_subtitle_artifacts( + youtube_id=lesson.youtube_video_id, + duration_seconds=lesson.duration_seconds, + transcript=transcript, + english_transcript=english_transcript, + ) + lesson.watch_vocab_json = {} + lesson.cultural_notes_json = [] + db.commit() + + try: + artifacts = generate_lesson_artifacts_from_transcript( + lesson_id=str(lesson.id), + lesson_title=lesson.title, + youtube_id=lesson.youtube_video_id, + duration_seconds=lesson.duration_seconds, + transcript=transcript, + english_transcript=english_transcript, + ) + lesson.flashcards_json = artifacts.flashcards + lesson.watch_vocab_json = artifacts.watch_vocab + lesson.cultural_notes_json = artifacts.cultural_notes + lesson.error_code = None + lesson.error_message = None + except Exception as exc: + lesson.flashcards_json = None + lesson.error_code = "flashcard_generation_failed" + lesson.error_message = str(exc) + lesson.generation_status = "ready" - lesson.error_code = None - lesson.error_message = None db.commit() except Exception as exc: db.rollback() diff --git a/app/api/public.py b/app/api/public.py index 0f34ca5..447b1d5 100644 --- a/app/api/public.py +++ b/app/api/public.py @@ -107,6 +107,14 @@ def get_preview_flashcards( """Return flashcard data for a preview lesson without authentication.""" lesson = _get_ready_preview_lesson(db, lesson_id) if lesson.flashcards_json is None: + if lesson.error_code == "flashcard_generation_failed": + raise HTTPException( + status_code=422, + detail={ + "code": "flashcard_generation_failed", + "message": lesson.error_message or "Flashcard generation failed.", + }, + ) raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail={ @@ -139,6 +147,7 @@ def get_preview_subtitles( ) return { **lesson.subtitles_json, + "youtubeId": lesson.subtitles_json.get("youtubeId") or lesson.youtube_video_id, "vocabMap": lesson.watch_vocab_json or {}, "culturalNotes": lesson.cultural_notes_json or [], } diff --git a/app/core/config.py b/app/core/config.py index 29a0462..7412aa0 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -14,6 +14,7 @@ class Settings(BaseSettings): ai_provider: str = "mock" gemini_api_key: str = "" gemini_model: str = "gemini-2.5-flash" + supadata_api_key: str = "" cors_origins: str = ( "http://localhost:3000," "http://127.0.0.1:3000," @@ -21,7 +22,7 @@ class Settings(BaseSettings): "http://127.0.0.1:3001" ) - model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8") + model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore") @property def cors_origin_list(self) -> list[str]: diff --git a/app/services/lesson_artifacts.py b/app/services/lesson_artifacts.py index 5ab3b65..efb3ad1 100644 --- a/app/services/lesson_artifacts.py +++ b/app/services/lesson_artifacts.py @@ -1,9 +1,14 @@ from dataclasses import dataclass from typing import Any import json +import random from app.core.config import get_settings -from app.services.transcripts import TranscriptResult +from app.services.transcripts import TranscriptResult, TranscriptSegment + + +FLASHCARD_TRANSCRIPT_MAX_SECONDS = 180 +FLASHCARD_TRANSCRIPT_MAX_CHARS = 6000 class ArtifactValidationError(ValueError): @@ -24,7 +29,18 @@ def generate_lesson_artifacts_from_transcript( youtube_id: str, duration_seconds: int, transcript: TranscriptResult, + english_transcript: TranscriptResult | None = None, ) -> LessonArtifacts: + subtitles = build_subtitle_artifacts( + youtube_id=youtube_id, + duration_seconds=duration_seconds, + transcript=transcript, + english_transcript=english_transcript, + ) + flashcard_transcript = sample_transcript_for_flashcards( + transcript=transcript, + seed=f"{lesson_id}:{youtube_id}", + ) settings = get_settings() if settings.ai_provider == "gemini" and settings.gemini_api_key: payload = _call_gemini( @@ -32,70 +48,255 @@ def generate_lesson_artifacts_from_transcript( lesson_title=lesson_title, youtube_id=youtube_id, duration_seconds=duration_seconds, - transcript=transcript, + transcript=flashcard_transcript, ) else: - payload = _mock_artifacts( + payload = _mock_flashcards( lesson_id=lesson_id, lesson_title=lesson_title, youtube_id=youtube_id, duration_seconds=duration_seconds, - transcript=transcript, + transcript=flashcard_transcript, ) - return validate_lesson_artifacts(payload) + flashcards = validate_flashcard_artifacts(payload) + watch_enrichments = validate_watch_enrichments(payload) + return LessonArtifacts( + flashcards=flashcards, + subtitles=subtitles, + watch_vocab=watch_enrichments.watch_vocab, + cultural_notes=watch_enrichments.cultural_notes, + ) -def validate_lesson_artifacts(payload: dict[str, Any]) -> LessonArtifacts: - flashcards = payload.get("flashcards") - if not isinstance(flashcards, dict): - raise ArtifactValidationError("flashcards must be an object") - if not isinstance(flashcards.get("cards"), list): - raise ArtifactValidationError("flashcards.cards must be a list") +@dataclass(frozen=True) +class WatchEnrichments: + watch_vocab: dict[str, Any] + cultural_notes: list[dict[str, Any]] + + +def build_subtitle_artifacts( + youtube_id: str, + duration_seconds: int, + transcript: TranscriptResult, + english_transcript: TranscriptResult | None = None, +) -> dict[str, Any]: + return { + "youtubeId": youtube_id, + "durationSec": duration_seconds, + "lines": [ + { + "id": f"s{index}", + "startSec": segment.start_sec, + "endSec": segment.end_sec, + "korean": segment.text, + "english": _matching_english_text(segment, english_transcript), + } + for index, segment in enumerate(transcript.segments, start=1) + ], + } + + +def _matching_english_text( + korean_segment: TranscriptSegment, + english_transcript: TranscriptResult | None, +) -> str: + if english_transcript is None: + return "" + + matches = [ + segment.text + for segment in english_transcript.segments + if segment.end_sec > korean_segment.start_sec + and segment.start_sec < korean_segment.end_sec + ] + return " ".join(matches) + + +def limit_transcript_for_flashcards( + transcript: TranscriptResult, + max_seconds: int = FLASHCARD_TRANSCRIPT_MAX_SECONDS, + max_chars: int = FLASHCARD_TRANSCRIPT_MAX_CHARS, +) -> TranscriptResult: + segments: list[TranscriptSegment] = [] + used_chars = 0 + + for segment in transcript.segments: + if segment.start_sec >= max_seconds: + break + + remaining_chars = max_chars - used_chars + if remaining_chars <= 0: + break + + text = segment.text[:remaining_chars].rstrip() + if not text: + break + + segments.append( + TranscriptSegment( + start_sec=segment.start_sec, + end_sec=min(segment.end_sec, max_seconds), + text=text, + ) + ) + used_chars += len(text) + + if segment.end_sec >= max_seconds or used_chars >= max_chars: + break + + return TranscriptResult( + source=transcript.source, + text="\n".join(segment.text for segment in segments), + segments=segments, + lang=transcript.lang, + ) + + +def sample_transcript_for_flashcards( + transcript: TranscriptResult, + seed: str, + max_seconds: int = FLASHCARD_TRANSCRIPT_MAX_SECONDS, + max_chars: int = FLASHCARD_TRANSCRIPT_MAX_CHARS, +) -> TranscriptResult: + if not transcript.segments: + return TranscriptResult(source=transcript.source, text="", segments=[], lang=transcript.lang) + + total_start = transcript.segments[0].start_sec + total_end = max(segment.end_sec for segment in transcript.segments) + if total_end - total_start <= max_seconds: + return limit_transcript_for_flashcards(transcript, max_seconds=max_seconds, max_chars=max_chars) + + rng = random.Random(seed) + window_count = 2 + window_seconds = max_seconds / window_count + bucket_seconds = (total_end - total_start) / window_count + starts = [] + for index in range(window_count): + bucket_start = total_start + (index * bucket_seconds) + bucket_end = total_start + ((index + 1) * bucket_seconds) + latest_start = max(bucket_start, bucket_end - window_seconds) + starts.append(rng.uniform(bucket_start, latest_start)) + + selected: list[TranscriptSegment] = [] + used_chars = 0 + selected_keys: set[tuple[float, float, str]] = set() + + for start in starts: + end = min(start + window_seconds, total_end) + for segment in transcript.segments: + if segment.end_sec <= start or segment.start_sec >= end: + continue + remaining_chars = max_chars - used_chars + if remaining_chars <= 0: + break + + text = segment.text[:remaining_chars].rstrip() + if not text: + break + + clipped = TranscriptSegment( + start_sec=max(segment.start_sec, start), + end_sec=min(segment.end_sec, end), + text=text, + ) + key = (clipped.start_sec, clipped.end_sec, clipped.text) + if key in selected_keys: + continue + + selected.append(clipped) + selected_keys.add(key) + used_chars += len(text) + + if used_chars >= max_chars: + break + if not selected: + return limit_transcript_for_flashcards( + transcript, + max_seconds=max_seconds, + max_chars=max_chars, + ) + + selected.sort(key=lambda segment: (segment.start_sec, segment.end_sec)) + return TranscriptResult( + source=transcript.source, + text="\n".join(segment.text for segment in selected), + segments=selected, + lang=transcript.lang, + ) + + +def validate_lesson_artifacts(payload: dict[str, Any]) -> LessonArtifacts: subtitles = payload.get("subtitles") if not isinstance(subtitles, dict): raise ArtifactValidationError("subtitles must be an object") if not isinstance(subtitles.get("lines"), list): raise ArtifactValidationError("subtitles.lines must be a list") - watch_vocab = subtitles.get("vocabMap", {}) - if not isinstance(watch_vocab, dict): - raise ArtifactValidationError("subtitles.vocabMap must be an object") - - cultural_notes = subtitles.get("culturalNotes", []) - if not isinstance(cultural_notes, list): - raise ArtifactValidationError("subtitles.culturalNotes must be a list") - return LessonArtifacts( - flashcards=flashcards, + flashcards=validate_flashcard_artifacts(payload), subtitles={ "youtubeId": subtitles.get("youtubeId"), "durationSec": subtitles.get("durationSec"), "lines": subtitles["lines"], }, + watch_vocab=_validate_watch_vocab(subtitles), + cultural_notes=_validate_cultural_notes(subtitles), + ) + + +def validate_flashcard_artifacts(payload: dict[str, Any]) -> dict[str, Any]: + flashcards = payload.get("flashcards") + if not isinstance(flashcards, dict): + raise ArtifactValidationError("flashcards must be an object") + if not isinstance(flashcards.get("cards"), list): + raise ArtifactValidationError("flashcards.cards must be a list") + return flashcards + + +def validate_watch_enrichments(payload: dict[str, Any]) -> WatchEnrichments: + watch = payload.get("watch", {}) + if watch is None: + watch = {} + if not isinstance(watch, dict): + raise ArtifactValidationError("watch must be an object") + + watch_vocab = watch.get("vocabMap", {}) + if not isinstance(watch_vocab, dict): + raise ArtifactValidationError("watch.vocabMap must be an object") + + cultural_notes = watch.get("culturalNotes", []) + if not isinstance(cultural_notes, list): + raise ArtifactValidationError("watch.culturalNotes must be a list") + + return WatchEnrichments( watch_vocab=watch_vocab, cultural_notes=cultural_notes, ) -def _mock_artifacts( +def _validate_watch_vocab(subtitles: dict[str, Any]) -> dict[str, Any]: + watch_vocab = subtitles.get("vocabMap", {}) + if not isinstance(watch_vocab, dict): + raise ArtifactValidationError("subtitles.vocabMap must be an object") + return watch_vocab + + +def _validate_cultural_notes(subtitles: dict[str, Any]) -> list[dict[str, Any]]: + cultural_notes = subtitles.get("culturalNotes", []) + if not isinstance(cultural_notes, list): + raise ArtifactValidationError("subtitles.culturalNotes must be a list") + return cultural_notes + + +def _mock_flashcards( lesson_id: str, lesson_title: str, youtube_id: str, duration_seconds: int, transcript: TranscriptResult, ) -> dict[str, Any]: - lines = [ - { - "id": f"s{index}", - "startSec": int(segment.start_sec), - "endSec": int(segment.end_sec), - "korean": segment.text, - "english": f"English translation for: {segment.text}", - } - for index, segment in enumerate(transcript.segments, start=1) - ] first_segment = transcript.segments[0] if transcript.segments else None start_sec = int(first_segment.start_sec) if first_segment else 0 end_sec = int(first_segment.end_sec) if first_segment else min(duration_seconds, 5) @@ -128,10 +329,7 @@ def _mock_artifacts( } ], }, - "subtitles": { - "youtubeId": youtube_id, - "durationSec": duration_seconds, - "lines": lines, + "watch": { "vocabMap": { expression: { "meaning": f"Meaning of {expression}", @@ -189,7 +387,7 @@ def _call_gemini( "contents": [{"role": "user", "parts": [{"text": prompt}]}], "generationConfig": {"responseMimeType": "application/json"}, }, - timeout=300, + timeout=600, ) response.raise_for_status() data = response.json() @@ -255,8 +453,8 @@ def _build_gemini_prompt( transcript: TranscriptResult, ) -> str: timestamped_segments = "\n".join( - f"[{int(segment.start_sec)}-{int(segment.end_sec)}] {segment.text}" - for segment in transcript.segments + f"[s{index} {int(segment.start_sec)}-{int(segment.end_sec)}] {segment.text}" + for index, segment in enumerate(transcript.segments, start=1) ) return f""" @@ -266,24 +464,24 @@ def _build_gemini_prompt( Required top-level shape: {{ "flashcards": {{"lessonId": "{lesson_id}", "lessonTitle": "{lesson_title}", "cards": []}}, - "subtitles": {{ - "youtubeId": "{youtube_id}", - "durationSec": {duration_seconds}, - "lines": [], - "vocabMap": {{}}, - "culturalNotes": [] - }} + "watch": {{"vocabMap": {{}}, "culturalNotes": []}} }} Rules: -- flashcards.cards must contain 5 to 10 cards when the transcript has enough material. -- Include BOTH word cards and useful ending cards. +- You are receiving a deterministic sample of continuous transcript excerpts, capped to roughly {FLASHCARD_TRANSCRIPT_MAX_SECONDS // 60} total minutes and {FLASHCARD_TRANSCRIPT_MAX_CHARS} characters to keep the request small and reliable. +- Create flashcards and watch enrichments only from these sampled excerpts. Do not try to cover the whole video. +- flashcards.cards must contain 3 to 5 cards when the excerpt has enough material. +- Prefer word cards. Include at most 1 ending card only if a useful grammar pattern is obvious. - Use ONLY the timestamped transcript segments below for all startSec/endSec values. - For every flashcard video, startSec/endSec MUST match the transcript segment that contains the exampleSentence or scriptSentence. Do not invent timestamps. -- For subtitles.lines, preserve the transcript segment timing exactly unless adjacent segments must be merged for readability. If merging, use the first segment startSec and last segment endSec. -- YOU MUST format EACH card EXACTLY according to these structures: - -Structure for Word card (type="word"): +- watch.vocabMap powers hidden vocabulary labels in the Watch UI. Keys MUST be Korean surface forms that appear verbatim in the sampled transcript. +- watch.vocabMap must contain at most 5 entries. When an entry corresponds to a flashcard, set cardId to that flashcard id. +- watch.vocabMap values MUST include meaning, lessonId, expression, exampleSentence, and exampleTranslation. +- watch.culturalNotes should contain 0 to 2 notes for slang, idioms, cultural context, or grammar patterns found in the sampled excerpts. +- watch.culturalNotes subtitleId MUST reference one of the sampled subtitle ids shown below, such as s1, s2, s3. +- Use these compact card shapes: + +Word card: {{ "id": "fc-{lesson_id}-word-1", "type": "word", @@ -299,7 +497,7 @@ def _build_gemini_prompt( ] }} -Structure for Ending card (type="ending"): +Ending card: {{ "id": "fc-{lesson_id}-ending-1", "type": "ending", @@ -318,11 +516,7 @@ def _build_gemini_prompt( "relatedVideos": [] }} -- subtitles.lines must contain Korean and English lines with startSec and endSec. Example: {{"id": "s1", "startSec": 0, "endSec": 5, "korean": "...", "english": "..."}} -- vocabMap keys must be surface forms that appear in subtitle Korean text. -- culturalNotes should explain slang, idioms, cultural context, or grammar patterns. - Transcript source: {transcript.source} Timestamped transcript segments: -{timestamped_segments[:18000]} +{timestamped_segments} """.strip() diff --git a/app/services/transcripts.py b/app/services/transcripts.py index b0d0df3..3849bdb 100644 --- a/app/services/transcripts.py +++ b/app/services/transcripts.py @@ -1,9 +1,12 @@ from dataclasses import dataclass from pathlib import Path -from typing import Callable, Literal +from typing import Literal import html -import re -import subprocess + +import httpx + +from app.core.config import get_settings +from app.services.youtube import extract_video_id @dataclass(frozen=True) @@ -18,68 +21,7 @@ class TranscriptResult: source: Literal["youtube_caption", "youtube_auto_caption"] text: str segments: list[TranscriptSegment] - - -CommandRunner = Callable[[list[str]], subprocess.CompletedProcess[str]] - -TIMESTAMP_RE = re.compile( - r"(?P\d{2}):(?P\d{2}):(?P\d{2})[.,](?P\d{3})" -) - - -def run_command(args: list[str]) -> subprocess.CompletedProcess[str]: - return subprocess.run(args, capture_output=True, text=True, check=False) - - -def timestamp_to_seconds(value: str) -> float: - match = TIMESTAMP_RE.search(value) - if match is None: - raise ValueError(f"Invalid timestamp: {value}") - return ( - int(match.group("h")) * 3600 - + int(match.group("m")) * 60 - + int(match.group("s")) - + int(match.group("ms")) / 1000 - ) - - -def clean_caption_text(value: str) -> str: - value = re.sub(r"<[^>]+>", "", value) - value = html.unescape(value) - return re.sub(r"\s+", " ", value).strip() - - -def parse_vtt(path: Path) -> list[TranscriptSegment]: - segments: list[TranscriptSegment] = [] - current_range: tuple[float, float] | None = None - text_lines: list[str] = [] - - def flush() -> None: - nonlocal current_range, text_lines - if current_range and text_lines: - text = clean_caption_text(" ".join(text_lines)) - if text: - segments.append(TranscriptSegment(current_range[0], current_range[1], text)) - current_range = None - text_lines = [] - - for raw_line in path.read_text(encoding="utf-8", errors="replace").splitlines(): - line = raw_line.strip() - if not line: - flush() - continue - if "-->" in line: - flush() - start_raw, end_raw = [part.strip() for part in line.split("-->", 1)] - end_raw = end_raw.split(" ", 1)[0] - current_range = (timestamp_to_seconds(start_raw), timestamp_to_seconds(end_raw)) - continue - if line == "WEBVTT" or line.startswith(("Kind:", "Language:", "NOTE")): - continue - if current_range: - text_lines.append(line) - flush() - return segments + lang: str | None = None def filter_segments( @@ -94,50 +36,85 @@ def filter_segments( ] +def language_matches(requested_lang: str, actual_lang: str | None) -> bool: + if not actual_lang: + return False + + return _base_language_code(requested_lang) == _base_language_code(actual_lang) + + +def _base_language_code(lang: str) -> str: + return lang.strip().lower().replace("_", "-").split("-", maxsplit=1)[0] + + def download_youtube_captions( url: str, - output_dir: Path, + output_dir: Path, # Signature compatibility lang: str, start_sec: int, end_sec: int, allow_auto: bool = True, - runner: CommandRunner = run_command, + runner: any = None, # Signature compatibility + require_requested_lang: bool = True, ) -> TranscriptResult | None: - output_dir.mkdir(parents=True, exist_ok=True) - output_template = str(output_dir / "captions.%(ext)s") - args = [ - "yt-dlp", - "--skip-download", - "--sub-lang", - lang, - "--write-sub", - "--sub-format", - "vtt", - "-o", - output_template, - url, - ] - if allow_auto: - args.insert(4, "--write-auto-sub") - - result = runner(args) - if result.returncode != 0: + settings = get_settings() + if not settings.supadata_api_key: return None - vtt_files = sorted(output_dir.glob("captions*.vtt")) - if not vtt_files: + try: + video_id = extract_video_id(url) + except Exception: return None - all_segments = parse_vtt(vtt_files[0]) - scoped = filter_segments(all_segments, start_sec=start_sec, end_sec=end_sec) - text = "\n".join(item.text for item in scoped) - if len(text.strip()) < 20: + try: + response = httpx.get( + "https://api.supadata.ai/v1/youtube/transcript", + params={"videoId": video_id, "lang": lang}, + headers={"x-api-key": settings.supadata_api_key}, + timeout=30 + ) + if response.status_code != 200: + return None + + data = response.json() + actual_lang = data.get("lang") + if require_requested_lang and not language_matches(lang, actual_lang): + return None + + content = data.get("content", []) + + segments: list[TranscriptSegment] = [] + for item in content: + # offset and duration are in ms + start = item["offset"] / 1000.0 + duration = item["duration"] / 1000.0 + end = start + duration + + # Filter by time range + if end > start_sec and start < end_sec: + segments.append( + TranscriptSegment( + start_sec=start, + end_sec=end, + text=html.unescape(item["text"]), + ) + ) + + if not segments: + return None + + full_text = "\n".join(s.text for s in segments) + if len(full_text.strip()) < 20: + return None + + # Supadata defaults to the best available transcript. + # We'll label it as youtube_caption for consistency. + return TranscriptResult( + source="youtube_caption", + text=full_text, + segments=segments, + lang=actual_lang, + ) + + except Exception: return None - - source: Literal["youtube_caption", "youtube_auto_caption"] = ( - "youtube_caption" if ".auto." not in vtt_files[0].name else "youtube_auto_caption" - ) - if allow_auto: - source = "youtube_auto_caption" - - return TranscriptResult(source=source, text=text, segments=scoped) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml new file mode 100644 index 0000000..f679621 --- /dev/null +++ b/docker-compose.prod.yml @@ -0,0 +1,49 @@ +services: + api: + build: + context: . + image: linko-server-api:latest + container_name: linko-api + restart: unless-stopped + env_file: + - .env + environment: + DATABASE_URL: ${DATABASE_URL:?Set DATABASE_URL in .env} + ports: + - "${API_PORT:-8000}:8000" + depends_on: + postgres: + condition: service_healthy + healthcheck: + test: + [ + "CMD-SHELL", + "python -c \"import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/api/health', timeout=5)\"", + ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 20s + + postgres: + image: postgres:16-alpine + container_name: linko-postgres + restart: unless-stopped + env_file: + - .env + environment: + POSTGRES_DB: ${POSTGRES_DB:-linko} + POSTGRES_USER: ${POSTGRES_USER:-linko} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?Set POSTGRES_PASSWORD in .env} + ports: + - "127.0.0.1:${POSTGRES_HOST_PORT:-5432}:5432" + volumes: + - postgres_data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-linko} -d ${POSTGRES_DB:-linko}"] + interval: 10s + timeout: 5s + retries: 5 + +volumes: + postgres_data: diff --git a/tests/test_lesson_artifacts.py b/tests/test_lesson_artifacts.py index 0613ca0..495625e 100644 --- a/tests/test_lesson_artifacts.py +++ b/tests/test_lesson_artifacts.py @@ -1,10 +1,16 @@ import pytest +from app.core.config import get_settings from app.services.lesson_artifacts import ( ArtifactValidationError, + FLASHCARD_TRANSCRIPT_MAX_CHARS, + build_subtitle_artifacts, generate_lesson_artifacts_from_transcript, + limit_transcript_for_flashcards, + sample_transcript_for_flashcards, _parse_gemini_json, validate_lesson_artifacts, + validate_watch_enrichments, ) from app.services.transcripts import TranscriptResult, TranscriptSegment @@ -33,6 +39,7 @@ def sample_transcript() -> TranscriptResult: def test_generate_lesson_artifacts_returns_frontend_contract(monkeypatch: pytest.MonkeyPatch): monkeypatch.setenv("AI_PROVIDER", "mock") + get_settings.cache_clear() artifacts = generate_lesson_artifacts_from_transcript( lesson_id="42", @@ -52,8 +59,90 @@ def test_generate_lesson_artifacts_returns_frontend_contract(monkeypatch: pytest } assert artifacts.subtitles["youtubeId"] == "abc123XYZ00" assert artifacts.subtitles["lines"][0]["korean"].startswith("안녕하세요") + assert artifacts.subtitles["lines"][0]["english"] == "" assert "안녕하세요" in artifacts.watch_vocab + assert artifacts.watch_vocab["안녕하세요"]["cardId"] == "fc-42-1" assert artifacts.cultural_notes[0]["subtitleId"] == "s1" + get_settings.cache_clear() + + +def test_build_subtitle_artifacts_merges_overlapping_english_segments(): + english_transcript = TranscriptResult( + source="youtube_caption", + text="Hello. Today we study Korean.", + lang="en", + segments=[ + TranscriptSegment(start_sec=0, end_sec=2, text="Hello."), + TranscriptSegment(start_sec=2, end_sec=5, text="Today we study Korean."), + ], + ) + + subtitles = build_subtitle_artifacts( + youtube_id="abc123XYZ00", + duration_seconds=10, + transcript=sample_transcript(), + english_transcript=english_transcript, + ) + + assert subtitles["lines"][0]["english"] == "Hello. Today we study Korean." + assert subtitles["lines"][1]["english"] == "" + + +def test_flashcard_transcript_is_limited_to_safe_duration_and_character_count(): + transcript = TranscriptResult( + source="youtube_caption", + text="", + segments=[ + TranscriptSegment(start_sec=0, end_sec=120, text="가" * 6000), + TranscriptSegment(start_sec=120, end_sec=240, text="나" * 6000), + TranscriptSegment(start_sec=240, end_sec=360, text="다" * 6000), + ], + ) + + limited = limit_transcript_for_flashcards(transcript) + + assert limited.segments[-1].end_sec <= 180 + assert len(limited.text) <= FLASHCARD_TRANSCRIPT_MAX_CHARS + len(limited.segments) + assert all(segment.start_sec < 180 for segment in limited.segments) + + +def test_flashcard_transcript_sampling_is_deterministic_and_not_always_the_start(): + transcript = TranscriptResult( + source="youtube_caption", + text="", + segments=[ + TranscriptSegment(start_sec=i * 30, end_sec=(i + 1) * 30, text=f"구간{i}") + for i in range(40) + ], + ) + + first = sample_transcript_for_flashcards(transcript, seed="lesson:abc") + second = sample_transcript_for_flashcards(transcript, seed="lesson:abc") + + assert first.segments == second.segments + assert first.segments[0].start_sec > 0 + covered_seconds = sum(segment.end_sec - segment.start_sec for segment in first.segments) + assert covered_seconds <= 180 + + +def test_flashcard_transcript_sampling_falls_back_when_windows_are_empty(): + transcript = TranscriptResult( + source="youtube_caption", + text="", + segments=[ + TranscriptSegment(start_sec=0, end_sec=10, text="첫 실제 발화"), + TranscriptSegment(start_sec=1000, end_sec=1010, text="마지막 실제 발화"), + ], + lang="ko", + ) + + sampled = sample_transcript_for_flashcards(transcript, seed="lesson:abc") + + assert sampled.segments == [ + TranscriptSegment(start_sec=0, end_sec=10, text="첫 실제 발화") + ] + assert sampled.text == "첫 실제 발화" + assert sampled.lang == "ko" def test_validate_lesson_artifacts_rejects_missing_required_shapes(): @@ -74,6 +163,14 @@ def test_validate_lesson_artifacts_rejects_missing_required_shapes(): ) +def test_validate_watch_enrichments_rejects_bad_shapes(): + with pytest.raises(ArtifactValidationError, match="watch.vocabMap"): + validate_watch_enrichments({"watch": {"vocabMap": []}}) + + with pytest.raises(ArtifactValidationError, match="watch.culturalNotes"): + validate_watch_enrichments({"watch": {"culturalNotes": {}}}) + + def test_parse_gemini_json_repairs_trailing_commas(): payload = """ { diff --git a/tests/test_lessons_api.py b/tests/test_lessons_api.py index 80ebe91..bdeed8c 100644 --- a/tests/test_lessons_api.py +++ b/tests/test_lessons_api.py @@ -8,6 +8,7 @@ import app.api.lessons as lessons_api from app.api.auth import get_google_user +from app.core.config import get_settings from app.db.base import Base from app.db.session import enable_sqlite_foreign_keys, get_db from app.main import app @@ -139,6 +140,44 @@ def test_ready_lesson_flashcards_and_subtitles_are_returned(client: TestClient): assert response.json()["youtubeId"] == "ready123" +def test_lesson_subtitles_fall_back_to_lesson_youtube_id_when_artifact_omits_it( + client: TestClient, +): + headers = auth_headers(client) + db = next(app.dependency_overrides[get_db]()) + try: + user_id = db.scalar(select(User.id).where(User.email == "lessons@example.com")) + lesson = Lesson( + user_id=user_id, + youtube_url="https://youtu.be/fallback123", + youtube_video_id="fallback123", + title="Fallback Video Lesson", + channel_title="Channel", + thumbnail_url=None, + duration_seconds=60, + generation_status="ready", + transcript_status="ready", + transcript_source="youtube_caption", + transcript_text="안녕하세요.", + flashcards_json={"lessonId": "1", "lessonTitle": "Fallback Video Lesson", "cards": []}, + subtitles_json={"durationSec": 60, "lines": []}, + watch_vocab_json={}, + cultural_notes_json=[], + raw_youtube_metadata={}, + ) + db.add(lesson) + db.commit() + db.refresh(lesson) + lesson_id = lesson.id + finally: + db.close() + + response = client.get(f"/api/lessons/{lesson_id}/subtitles", headers=headers) + + assert response.status_code == 200 + assert response.json()["youtubeId"] == "fallback123" + + def test_lesson_artifact_endpoints_return_status_specific_errors(client: TestClient): headers = auth_headers(client) db = next(app.dependency_overrides[get_db]()) @@ -189,6 +228,8 @@ def test_lesson_artifact_endpoints_return_status_specific_errors(client: TestCli def test_background_task_generates_and_stores_artifacts(client: TestClient, monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("AI_PROVIDER", "mock") + get_settings.cache_clear() headers = auth_headers(client) db = next(app.dependency_overrides[get_db]()) try: @@ -212,12 +253,24 @@ def test_background_task_generates_and_stores_artifacts(client: TestClient, monk finally: db.close() - monkeypatch.setattr( - lessons_api, - "download_youtube_captions", - lambda *args, **kwargs: TranscriptResult( + def fake_download(*args, **kwargs): + if kwargs["lang"] == "en": + return TranscriptResult( + source="youtube_caption", + text="Hello. Today we study Korean.", + lang="en", + segments=[ + TranscriptSegment( + start_sec=0, + end_sec=5, + text="Hello. Today we study Korean.", + ) + ], + ) + return TranscriptResult( source="youtube_caption", text="안녕하세요. 오늘은 한국어를 공부해요.", + lang="ko", segments=[ TranscriptSegment( start_sec=0, @@ -225,8 +278,9 @@ def test_background_task_generates_and_stores_artifacts(client: TestClient, monk text="안녕하세요. 오늘은 한국어를 공부해요.", ) ], - ), - ) + ) + + monkeypatch.setattr(lessons_api, "download_youtube_captions", fake_download) lessons_api.generate_lesson_artifacts_task(lesson_id) @@ -237,3 +291,82 @@ def test_background_task_generates_and_stores_artifacts(client: TestClient, monk response = client.get(f"/api/lessons/{lesson_id}/subtitles", headers=headers) assert response.status_code == 200 assert response.json()["youtubeId"] == "abc123XYZ00" + assert response.json()["lines"][0]["english"] == "Hello. Today we study Korean." + assert "안녕하세요" in response.json()["vocabMap"] + assert response.json()["culturalNotes"][0]["subtitleId"] == "s1" + get_settings.cache_clear() + + +def test_background_task_keeps_watch_ready_when_flashcard_generation_fails( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +): + headers = auth_headers(client) + db = next(app.dependency_overrides[get_db]()) + try: + user_id = db.scalar(select(User.id).where(User.email == "lessons@example.com")) + lesson = Lesson( + user_id=user_id, + youtube_url="https://youtu.be/abc123XYZ00", + youtube_video_id="abc123XYZ00", + title="Subtitle Only Lesson", + channel_title="Channel", + thumbnail_url=None, + duration_seconds=900, + generation_status="generating", + transcript_status="pending", + raw_youtube_metadata={}, + ) + db.add(lesson) + db.commit() + db.refresh(lesson) + lesson_id = lesson.id + finally: + db.close() + + requested_ranges: list[tuple[int, int]] = [] + + def fake_download(*args, **kwargs): + requested_ranges.append((kwargs["start_sec"], kwargs["end_sec"])) + if kwargs["lang"] == "en": + return None + return TranscriptResult( + source="youtube_caption", + text="안녕하세요. 오늘은 한국어를 공부해요.", + segments=[ + TranscriptSegment( + start_sec=0, + end_sec=5, + text="안녕하세요. 오늘은 한국어를 공부해요.", + ) + ], + ) + + monkeypatch.setattr(lessons_api, "download_youtube_captions", fake_download) + monkeypatch.setattr( + lessons_api, + "generate_lesson_artifacts_from_transcript", + lambda *args, **kwargs: (_ for _ in ()).throw(RuntimeError("Gemini unavailable")), + ) + + lessons_api.generate_lesson_artifacts_task(lesson_id) + + assert requested_ranges == [(0, 900), (0, 900)] + + response = client.get(f"/api/lessons/{lesson_id}", headers=headers) + assert response.status_code == 200 + assert response.json()["generationStatus"] == "ready" + assert response.json()["subtitleDone"] is True + assert response.json()["flashcardDone"] is False + assert response.json()["errorCode"] == "flashcard_generation_failed" + + response = client.get(f"/api/lessons/{lesson_id}/subtitles", headers=headers) + assert response.status_code == 200 + body = response.json() + assert body["youtubeId"] == "abc123XYZ00" + assert body["lines"][0]["korean"].startswith("안녕하세요") + assert body["lines"][0]["english"] == "" + + response = client.get(f"/api/lessons/{lesson_id}/flashcards", headers=headers) + assert response.status_code == 422 + assert response.json()["detail"]["code"] == "flashcard_generation_failed" diff --git a/tests/test_public_api.py b/tests/test_public_api.py index 44fec58..8fc3541 100644 --- a/tests/test_public_api.py +++ b/tests/test_public_api.py @@ -14,6 +14,7 @@ from fastapi.testclient import TestClient from sqlalchemy import create_engine from sqlalchemy.orm import Session, sessionmaker +from sqlalchemy.pool import StaticPool from app.db.base import Base from app.db.session import get_db @@ -28,8 +29,12 @@ TEST_DATABASE_URL = "sqlite:///:memory:" -engine = create_engine(TEST_DATABASE_URL, connect_args={"check_same_thread": False}) -TestingSessionLocal = sessionmaker(bind=engine) +engine = create_engine( + TEST_DATABASE_URL, + connect_args={"check_same_thread": False}, + poolclass=StaticPool, +) +TestingSessionLocal = sessionmaker(bind=engine, expire_on_commit=False) def override_get_db(): @@ -42,12 +47,11 @@ def override_get_db(): @pytest.fixture(autouse=True) def setup_db(): + app.dependency_overrides[get_db] = override_get_db Base.metadata.create_all(bind=engine) yield Base.metadata.drop_all(bind=engine) - - -app.dependency_overrides[get_db] = override_get_db + app.dependency_overrides.clear() client = TestClient(app) @@ -113,16 +117,18 @@ def setup_db(): } ] +_UNSET = object() + def _make_lesson( db: Session, *, is_preview: bool = True, generation_status: str = "ready", - flashcards_json: dict | None = None, - subtitles_json: dict | None = None, - watch_vocab_json: dict | None = None, - cultural_notes_json: list | None = None, + flashcards_json: dict | None | object = _UNSET, + subtitles_json: dict | None | object = _UNSET, + watch_vocab_json: dict | None | object = _UNSET, + cultural_notes_json: list | None | object = _UNSET, ) -> Lesson: lesson = Lesson( user_id=1, @@ -135,10 +141,10 @@ def _make_lesson( generation_status=generation_status, is_preview=is_preview, transcript_status="ready", - flashcards_json=flashcards_json or _FLASHCARDS_JSON, - subtitles_json=subtitles_json or _SUBTITLES_JSON, - watch_vocab_json=watch_vocab_json or _WATCH_VOCAB_JSON, - cultural_notes_json=cultural_notes_json or _CULTURAL_NOTES_JSON, + flashcards_json=_FLASHCARDS_JSON if flashcards_json is _UNSET else flashcards_json, + subtitles_json=_SUBTITLES_JSON if subtitles_json is _UNSET else subtitles_json, + watch_vocab_json=_WATCH_VOCAB_JSON if watch_vocab_json is _UNSET else watch_vocab_json, + cultural_notes_json=_CULTURAL_NOTES_JSON if cultural_notes_json is _UNSET else cultural_notes_json, raw_youtube_metadata={}, created_at=datetime.now(UTC), updated_at=datetime.now(UTC), @@ -272,6 +278,15 @@ def test_get_preview_subtitles_schema(): assert isinstance(data["culturalNotes"], list) +def test_get_preview_subtitles_falls_back_to_lesson_youtube_id(): + with TestingSessionLocal() as db: + lesson = _make_lesson(db, subtitles_json={"durationSec": 60, "lines": []}) + + data = client.get(f"{BASE}/lessons/{lesson.id}/subtitles").json() + + assert data["youtubeId"] == lesson.youtube_video_id + + def test_get_preview_subtitles_lines_shape(): with TestingSessionLocal() as db: lesson = _make_lesson(db) diff --git a/tests/test_transcripts.py b/tests/test_transcripts.py index 3abd562..29a1d0d 100644 --- a/tests/test_transcripts.py +++ b/tests/test_transcripts.py @@ -1,43 +1,11 @@ from pathlib import Path -import subprocess - +from unittest.mock import patch, MagicMock from app.services.transcripts import ( TranscriptSegment, - clean_caption_text, download_youtube_captions, filter_segments, - parse_vtt, ) - -def test_clean_caption_text_removes_vtt_markup(): - assert clean_caption_text("안녕<00:00:01.000>하세요& 반가워요") == "안녕하세요& 반가워요" - - -def test_parse_vtt_reads_caption_segments(tmp_path: Path): - path = tmp_path / "captions.ko.vtt" - path.write_text( - """WEBVTT -Kind: captions -Language: ko - -00:00:00.000 --> 00:00:02.500 -안녕하세요. - -00:00:02.500 --> 00:00:05.000 -오늘은 한국어를 공부해요. -""", - encoding="utf-8", - ) - - segments = parse_vtt(path) - - assert segments == [ - TranscriptSegment(start_sec=0.0, end_sec=2.5, text="안녕하세요."), - TranscriptSegment(start_sec=2.5, end_sec=5.0, text="오늘은 한국어를 공부해요."), - ] - - def test_filter_segments_keeps_overlapping_segments(): segments = [ TranscriptSegment(start_sec=0, end_sec=2, text="one"), @@ -49,21 +17,67 @@ def test_filter_segments_keeps_overlapping_segments(): assert filter_segments(segments, start_sec=2.1, end_sec=3.9) == [segments[1]] -def test_download_youtube_captions_uses_runner_and_parses_vtt(tmp_path: Path): - def runner(args: list[str]) -> subprocess.CompletedProcess[str]: - assert args[0] == "yt-dlp" - (tmp_path / "captions.ko.vtt").write_text( - """WEBVTT +@patch("app.services.transcripts.httpx.get") +@patch("app.services.transcripts.get_settings") +def test_download_youtube_captions_success(mock_get_settings, mock_get, tmp_path: Path): + mock_settings = MagicMock() + mock_settings.supadata_api_key = "test-key" + mock_get_settings.return_value = mock_settings + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "content": [ + {"text": "안녕하세요.", "offset": 0, "duration": 5000, "lang": "ko"}, + {"text": "서울의 길거리 음식입니다.", "offset": 5000, "duration": 5000, "lang": "ko"}, + ], + "lang": "ko" + } + mock_get.return_value = mock_response -00:00:00.000 --> 00:00:05.000 -안녕하세요. 오늘은 서울의 길거리 음식을 함께 즐겨볼게요. + transcript = download_youtube_captions( + "https://youtu.be/abc123XYZ00", + tmp_path, + lang="ko", + start_sec=0, + end_sec=10, + allow_auto=True + ) -00:00:05.000 --> 00:00:10.000 -이 시장은 현지인도 자주 와서 맛있는 음식 가게로 가득해요. -""", - encoding="utf-8", - ) - return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + assert transcript is not None + assert transcript.source == "youtube_caption" + assert "길거리 음식" in transcript.text + assert len(transcript.segments) == 2 + assert transcript.segments[0].start_sec == 0.0 + assert transcript.segments[1].end_sec == 10.0 + assert transcript.lang == "ko" + + +@patch("app.services.transcripts.httpx.get") +@patch("app.services.transcripts.get_settings") +def test_download_youtube_captions_accepts_regional_language_variant( + mock_get_settings, + mock_get, + tmp_path: Path, +): + mock_settings = MagicMock() + mock_settings.supadata_api_key = "test-key" + mock_get_settings.return_value = mock_settings + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "content": [ + { + "text": "안녕하세요. 오늘은 한국어 자막을 공부합니다.", + "offset": 0, + "duration": 5000, + "lang": "ko-KR", + }, + ], + "lang": "ko-KR", + } + mock_get.return_value = mock_response transcript = download_youtube_captions( "https://youtu.be/abc123XYZ00", @@ -72,10 +86,42 @@ def runner(args: list[str]) -> subprocess.CompletedProcess[str]: start_sec=0, end_sec=10, allow_auto=True, - runner=runner, ) assert transcript is not None - assert transcript.source == "youtube_auto_caption" - assert "길거리 음식" in transcript.text - assert len(transcript.segments) == 2 + assert transcript.lang == "ko-KR" + assert "한국어 자막" in transcript.text + + +@patch("app.services.transcripts.httpx.get") +@patch("app.services.transcripts.get_settings") +def test_download_youtube_captions_rejects_supadata_language_fallback( + mock_get_settings, + mock_get, + tmp_path: Path, +): + mock_settings = MagicMock() + mock_settings.supadata_api_key = "test-key" + mock_get_settings.return_value = mock_settings + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "content": [ + {"text": "안녕하세요.", "offset": 0, "duration": 5000, "lang": "ko"}, + ], + "lang": "ko", + "availableLangs": ["ko"], + } + mock_get.return_value = mock_response + + transcript = download_youtube_captions( + "https://youtu.be/abc123XYZ00", + tmp_path, + lang="en", + start_sec=0, + end_sec=10, + allow_auto=True, + ) + + assert transcript is None