diff --git a/.github/workflows/nightly-e2e.yml b/.github/workflows/nightly-e2e.yml
new file mode 100644
index 000000000..07fb1807a
--- /dev/null
+++ b/.github/workflows/nightly-e2e.yml
@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Nightly E2E Tests
+
+# Submits the bundled Java/Python examples to a local Flink standalone cluster
+# every night, or on demand against a caller-chosen Flink version.
+on:
+  schedule:
+    - cron: '0 0 * * *'
+  workflow_dispatch:
+    inputs:
+      flink-version:
+        description: 'Flink version to test (e.g., 2.2.0, 2.1.1)'
+        required: false
+        default: '2.2.0'
+        type: string
+
+# The job only reads the repository; keep the token read-only.
+permissions:
+  contents: read
+
+jobs:
+  submit-examples-to-flink:
+    name: submit-examples [flink-${{ github.event.inputs.flink-version || matrix.flink-version }}]
+    runs-on: ubuntu-latest
+    # Budget: download ~5m + build ~15m + cluster/submit ~5m + buffer.
+    timeout-minutes: 45
+    env:
+      SKIP_SPOTLESS_CHECK: 'true'
+      # Resolved once so the test script and the artifact name agree; a manual
+      # dispatch input wins over the matrix default.
+      FLINK_VERSION: ${{ github.event.inputs.flink-version || matrix.flink-version }}
+    strategy:
+      fail-fast: false
+      matrix:
+        flink-version: ['2.2.0']
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install java
+        uses: actions/setup-java@v4
+        with:
+          java-version: '11'
+          distribution: 'temurin'
+      - name: Install python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install uv
+        # An explicit bash shell enables pipefail, so a failed download fails
+        # the step instead of piping nothing into sh.
+        shell: bash
+        run: |
+          curl -LsSf https://astral.sh/uv/0.11.0/install.sh | sh
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+      - name: Run submit-examples E2E test
+        run: bash e2e-test/test-scripts/test_submit_examples_to_flink.sh
+      - name: Upload Flink logs on failure
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: flink-logs-${{ env.FLINK_VERSION }}
+          path: flink-logs-*.tar.gz
+          if-no-files-found: ignore
diff --git a/e2e-test/test-scripts/test_submit_examples_to_flink.sh b/e2e-test/test-scripts/test_submit_examples_to_flink.sh
new file mode 100755
index 000000000..7932c3b6d
--- /dev/null
+++ b/e2e-test/test-scripts/test_submit_examples_to_flink.sh
@@ -0,0 +1,339 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+# Submits the Java/Python examples to a local Flink standalone cluster and
+# checks that the JobManager accepts each one. The examples talk to remote
+# LLM APIs, so we only verify submission, then cancel the jobs.
+#
+# Env: FLINK_VERSION (default 2.2.0), FLINK_HOME (reuse existing install),
+# SUBMIT_TIMEOUT (seconds per submission, default 180), VERBOSE=1 (set -x).
+
+set -euo pipefail
+
+if [[ "${VERBOSE:-0}" == "1" ]]; then
+    set -x
+fi
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[0;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+
+# Colour-tagged loggers; only log_error writes to stderr.
+log_info() { printf "${BLUE}[INFO]${NC} %s\n" "$*"; }
+log_ok() { printf "${GREEN}[OK]${NC} %s\n" "$*"; }
+log_warn() { printf "${YELLOW}[WARN]${NC} %s\n" "$*"; }
+log_error() { printf "${RED}[ERROR]${NC} %s\n" "$*" >&2; }
+log_section() {
+    printf "\n${BLUE}==============================================================${NC}\n"
+    printf "${BLUE}>>> %s${NC}\n" "$*"
+    printf "${BLUE}==============================================================${NC}\n"
+}
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
+log_info "Project root: $ROOT_DIR"
+
+FLINK_VERSION="${FLINK_VERSION:-2.2.0}"
+FLINK_MAJOR_MINOR="${FLINK_VERSION%.*}"
+SUBMIT_TIMEOUT="${SUBMIT_TIMEOUT:-180}"
+
+# Bash 3 (default on macOS) lacks associative arrays.
+RESULT_NAMES=()
+RESULT_STATES=()
+SUBMITTED_JOB_IDS=()
+
+# EXIT trap: cancels recorded jobs, stops the cluster, archives the Flink logs,
+# prints the summary (which exits 1 on any failure), else keeps the exit status.
+cleanup() {
+    local exit_code=$?
+    log_section "Cleanup"
+
+    if [[ -n "${FLINK_HOME:-}" && -x "$FLINK_HOME/bin/flink" ]]; then
+        # ":-" keeps expanding an empty array legal under `set -u` on old bash.
+        for jid in "${SUBMITTED_JOB_IDS[@]:-}"; do
+            [[ -n "$jid" ]] || continue
+            log_info "Cancelling job $jid"
+            "$FLINK_HOME/bin/flink" cancel "$jid" >/dev/null 2>&1 || true
+        done
+
+        if [[ -x "$FLINK_HOME/bin/stop-cluster.sh" ]]; then
+            log_info "Stopping Flink cluster"
+            "$FLINK_HOME/bin/stop-cluster.sh" >/dev/null 2>&1 || true
+        fi
+
+        if [[ -d "$FLINK_HOME/log" ]]; then
+            local log_archive="$ROOT_DIR/flink-logs-$(date +%Y%m%d-%H%M%S).tar.gz"
+            tar -czf "$log_archive" -C "$FLINK_HOME" log >/dev/null 2>&1 \
+                && log_info "Flink logs archived to: $log_archive" \
+                || log_warn "Failed to archive Flink logs"
+        fi
+    fi
+
+    print_summary
+    exit "$exit_code"
+}
+trap cleanup EXIT
+
+# Prints per-example PASS/FAIL lines and totals; exits 1 if anything failed.
+print_summary() {
+    log_section "Test summary"
+    local total=${#RESULT_NAMES[@]}
+    local passed=0
+    local failed=0
+    local i
+    for (( i = 0; i < total; i++ )); do
+        local name="${RESULT_NAMES[$i]}"
+        local state="${RESULT_STATES[$i]}"
+        if [[ "$state" == "PASS" ]]; then
+            printf " ${GREEN}PASS${NC} %s\n" "$name"
+            passed=$((passed + 1))
+        else
+            printf " ${RED}FAIL${NC} %s\n" "$name"
+            failed=$((failed + 1))
+        fi
+    done
+    printf "\nTotal: %d Passed: %d Failed: %d\n" "$total" "$passed" "$failed"
+
+    if (( failed > 0 )); then
+        log_error "$failed example(s) failed to submit"
+        exit 1
+    fi
+}
+
+# Appends an outcome: $1 = example label, $2 = "PASS" or "FAIL".
+record_result() {
+    RESULT_NAMES+=("$1")
+    RESULT_STATES+=("$2")
+}
+
+# Exports FLINK_HOME: reuses a valid one, else installs via tools/install.sh.
+install_flink() {
+    log_section "Step 1: install Flink standalone (version $FLINK_VERSION)"
+
+    if [[ -n "${FLINK_HOME:-}" && -x "$FLINK_HOME/bin/flink" ]]; then
+        log_info "Reusing existing FLINK_HOME: $FLINK_HOME"
+        export FLINK_HOME
+        return 0
+    fi
+
+    # Anchor VENV_DIR to the repo so we can find it after install.sh exits.
+    export VENV_DIR="${VENV_DIR:-$ROOT_DIR/.flink-agents-env}"
+
+    log_info "Running tools/install.sh --non-interactive --install-flink --enable-pyflink"
+    FLINK_VERSION="$FLINK_VERSION" bash "$ROOT_DIR/tools/install.sh" \
+        --non-interactive --install-flink --enable-pyflink
+
+    local install_dir="${INSTALL_DIR:-$HOME/.local/flink}"
+    export FLINK_HOME="${install_dir}/flink-${FLINK_VERSION}"
+
+    if [[ ! -x "$FLINK_HOME/bin/flink" ]]; then
+        log_error "Flink installation not found at expected path: $FLINK_HOME"
+        exit 1
+    fi
+    log_ok "Flink installed at: $FLINK_HOME"
+
+    # `flink run -py` shells out to a Python interpreter that must have
+    # pyflink importable. Activate the venv install.sh provisioned and
+    # point PYFLINK_CLIENT_EXECUTABLE at it.
+    if [[ ! -x "$VENV_DIR/bin/python" ]]; then
+        log_error "Expected Python venv not found at: $VENV_DIR"
+        exit 1
+    fi
+    # shellcheck disable=SC1091
+    source "$VENV_DIR/bin/activate"
+    export PYFLINK_CLIENT_EXECUTABLE="$VENV_DIR/bin/python"
+    log_ok "Activated PyFlink venv: $VENV_DIR"
+}
+
+# Builds the project via tools/build.sh in a subshell so the cwd is untouched.
+build_project() {
+    log_section "Step 2: build flink-agents (Java + Python)"
+    (
+        cd "$ROOT_DIR"
+        SKIP_SPOTLESS_CHECK=true bash tools/build.sh
+    )
+    log_ok "Build completed"
+}
+
+# Copies the dist uber jar for this Flink major.minor into $FLINK_HOME/lib.
+stage_dist_jars() {
+    log_section "Step 3: stage dist uber jar into \$FLINK_HOME/lib"
+
+    # Second <version> in the root pom.xml; presumably the first is the parent's.
+    local project_version
+    project_version=$(sed -n 's/.*<version>\(.*\)<\/version>.*/\1/p' \
+        "$ROOT_DIR/pom.xml" | head -n 2 | tail -n 1)
+    log_info "Detected project version: $project_version"
+
+    # The flink-version uber jar already bundles the common deps.
+    local flink_jar="$ROOT_DIR/dist/flink-${FLINK_MAJOR_MINOR}/target/flink-agents-dist-flink-${FLINK_MAJOR_MINOR}-${project_version}.jar"
+
+    if [[ ! -f "$flink_jar" ]]; then
+        log_error "Flink dist jar not found: $flink_jar"
+        exit 1
+    fi
+
+    cp "$flink_jar" "$FLINK_HOME/lib/"
+    log_ok "Staged: $(basename "$flink_jar")"
+}
+
+# Packages the examples module and stores the jar path in EXAMPLES_JAR.
+package_examples() {
+    log_section "Step 4: package examples module"
+    (
+        cd "$ROOT_DIR/examples"
+        mvn --batch-mode --no-transfer-progress package \
+            -DskipTests -Dspotless.skip=true
+    )
+
+    EXAMPLES_JAR=$(ls "$ROOT_DIR"/examples/target/flink-agents-examples-*.jar \
+        2>/dev/null | grep -v 'sources\|javadoc\|original' | head -n 1 || true)
+    if [[ -z "$EXAMPLES_JAR" || ! -f "$EXAMPLES_JAR" ]]; then
+        log_error "Examples jar not found under examples/target/"
+        exit 1
+    fi
+    log_ok "Examples jar: $EXAMPLES_JAR"
+}
+
+# Starts the cluster, then polls the REST API (60 tries, 2s apart) until it answers.
+start_cluster() {
+    log_section "Step 5: start Flink standalone cluster"
+    "$FLINK_HOME/bin/start-cluster.sh"
+
+    local rest_url="http://localhost:8081"
+    log_info "Waiting for JobManager REST API at $rest_url ..."
+    local i
+    for (( i = 0; i < 60; i++ )); do
+        if curl -fsS "$rest_url/overview" >/dev/null 2>&1; then
+            log_ok "Flink cluster is up"
+            return 0
+        fi
+        sleep 2
+    done
+
+    log_error "Flink cluster did not become ready in time"
+    exit 1
+}
+
+# Prints the last JobID found in file $1, or nothing. Always succeeds: a
+# missing ID must not trip `set -e`/pipefail in the caller.
+extract_job_id() {
+    # "Job has been submitted with JobID <job-id>"
+    grep -Eo 'JobID [0-9a-f]{32}' "$1" | tail -n 1 | awk '{print $2}' || true
+}
+
+# Waits (15 tries, 2s apart) for job $1 to show up in `flink list`, then cancels
+# it. Returns 1 when $1 is empty or the job never appears.
+verify_and_cancel_job() {
+    local job_id="$1"
+    if [[ -z "$job_id" ]]; then
+        log_error "No JobID found in the submission output"
+        return 1
+    fi
+    SUBMITTED_JOB_IDS+=("$job_id")
+
+    local i listing
+    for (( i = 0; i < 15; i++ )); do
+        # Capture first so grep -q exiting early cannot fail the pipe under pipefail.
+        listing=$("$FLINK_HOME/bin/flink" list 2>/dev/null || true)
+        if grep -qF -- "$job_id" <<<"$listing"; then
+            log_ok "Job $job_id is registered with the cluster"
+            "$FLINK_HOME/bin/flink" cancel "$job_id" >/dev/null 2>&1 || true
+            return 0
+        fi
+        sleep 2
+    done
+
+    log_error "Job $job_id never appeared in 'flink list'"
+    return 1
+}
+
+# Runs `flink run --detached "$@"` under SUBMIT_TIMEOUT and records PASS or FAIL
+# for label $1. Always returns 0 so one bad example cannot abort the whole run.
+_submit_and_record() {
+    local label="$1"
+    shift
+    local out
+    out=$(mktemp)
+    local rc=0
+    # `cmd || rc=$?` keeps the real status; inside `if ! cmd; then`, $? is the
+    # status of the negated test, which is always 0.
+    timeout "$SUBMIT_TIMEOUT" "$FLINK_HOME/bin/flink" run --detached "$@" \
+        >"$out" 2>&1 || rc=$?
+    cat "$out"
+
+    if (( rc != 0 )); then
+        log_error "$label submission failed (exit $rc)"
+        record_result "$label" "FAIL"
+        rm -f "$out"
+        return 0
+    fi
+
+    local job_id
+    job_id=$(extract_job_id "$out")
+    rm -f "$out"
+    if verify_and_cancel_job "$job_id"; then
+        record_result "$label" "PASS"
+    else
+        record_result "$label" "FAIL"
+    fi
+}
+
+# Submits main class $1 from EXAMPLES_JAR; labelled by its simple class name.
+submit_java_example() {
+    local class_name="$1"
+    log_section "Submitting Java example: $class_name"
+    _submit_and_record "java:${class_name##*.}" -c "$class_name" "$EXAMPLES_JAR"
+}
+
+# Submits the PyFlink script at path $1; a missing file is recorded as FAIL.
+submit_python_example() {
+    local script_path="$1"
+    local label="python:$(basename "$script_path" .py)"
+    log_section "Submitting Python example: $script_path"
+
+    if [[ ! -f "$script_path" ]]; then
+        log_error "Python example not found: $script_path"
+        record_result "$label" "FAIL"
+        return 0
+    fi
+    _submit_and_record "$label" -py "$script_path"
+}
+
+main() {
+    install_flink
+    build_project
+    stage_dist_jars
+    package_examples
+    start_cluster
+
+    log_section "Step 6: submit Java examples"
+    submit_java_example "org.apache.flink.agents.examples.ReActAgentExample"
+    submit_java_example "org.apache.flink.agents.examples.WorkflowSingleAgentExample"
+    submit_java_example "org.apache.flink.agents.examples.WorkflowMultipleAgentExample"
+
+    log_section "Step 7: submit Python examples"
+    submit_python_example "$ROOT_DIR/python/flink_agents/examples/quickstart/react_agent_example.py"
+    submit_python_example "$ROOT_DIR/python/flink_agents/examples/quickstart/workflow_single_agent_example.py"
+    submit_python_example "$ROOT_DIR/python/flink_agents/examples/quickstart/workflow_multiple_agent_example.py"
+}
+
+main "$@"