#!/usr/bin/env bash
set -euo pipefail
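
# serve.sh: launch an OpenAI-compatible vLLM server, wait for API readiness,
# issue a probe chat-completion request, and record progress in status.json.
#
# Environment variables read by this script:
#   MODEL_NAME                       model id to serve/probe (required when not simulating)
#   VLLM_PORT                        API port (default: 8000)
#   SIMULATE_ONLY                    set to 1 to skip the real server launch
#   SERVE_COMMAND                    full serving command (required when SIMULATE_ONLY=0)
#   VLLM_INSTALL_COMMAND             install command used when vllm is not importable
#   VLLM_VENV_DIR                    virtualenv directory name (default: .venv)
#   READINESS_DELAY_SECONDS          initial sleep before checking the server process (default: 5)
#   VLLM_READINESS_TIMEOUT_SECONDS   timeout for the /v1/models readiness poll (default: 900)
#   PROBE_PROMPT                     prompt sent in the probe request
#   SIMULATED_DURATION_SECONDS       sleep length in simulation mode (default: 3)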

status_file="${PWD}/status.json"
artifacts_dir="${PWD}/artifacts"
server_log="${artifacts_dir}/vllm.log"
install_log="${artifacts_dir}/install.log"
models_json="${artifacts_dir}/models.json"
probe_json="${artifacts_dir}/probe-response.json"
venv_dir="${PWD}/${VLLM_VENV_DIR:-.venv}"
venv_bin_dir="${venv_dir}/bin"

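# Emit an ISO-8601 UTC timestamp.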
timestamp() {
  date -u +"%Y-%m-%dT%H:%M:%SZ"
}

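# Write the current workflow state, message, model, and port to status.json.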
write_status() {
  local state="$1"
  local message="$2"
  python3 - "$status_file" "$state" "$message" "${MODEL_NAME:-}" "${VLLM_PORT:-8000}" <<'PY'
import json
import sys
from datetime import datetime, timezone

path, state, message, model, port = sys.argv[1:]
payload = {
    "status": state,
    "message": message,
    "updatedAt": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    "template": "llm-inference",
    "model": model,
    "port": port,
}
with open(path, "w", encoding="utf-8") as handle:
    json.dump(payload, handle, indent=2)
    handle.write("\n")
PY
}

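# Poll a URL until it returns valid JSON (saved to output_path) or the timeout expires.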
wait_for_json_endpoint() {
  local url="$1"
  local output_path="$2"
  local timeout_seconds="$3"
  python3 - "$url" "$output_path" "$timeout_seconds" <<'PY'
import json
import sys
import time
import urllib.error
import urllib.request

url, output_path, timeout_seconds = sys.argv[1], sys.argv[2], int(sys.argv[3])
deadline = time.time() + timeout_seconds
last_error = "endpoint did not become ready"

while time.time() < deadline:
    try:
        with urllib.request.urlopen(url, timeout=20) as response:
            body = response.read().decode("utf-8")
        json.loads(body)
        with open(output_path, "w", encoding="utf-8") as handle:
            handle.write(body)
            if not body.endswith("\n"):
                handle.write("\n")
        sys.exit(0)
    except Exception as exc:  # noqa: BLE001
        last_error = str(exc)
        time.sleep(5)

print(last_error, file=sys.stderr)
sys.exit(1)
PY
}

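# Send one deterministic chat-completion request to the local API and store the parsed response.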
run_probe_request() {
  local port="$1"
  local model="$2"
  local prompt="$3"
  local output_path="$4"
  python3 - "$port" "$model" "$prompt" "$output_path" <<'PY'
import json
import sys
import urllib.request

port, model, prompt, output_path = sys.argv[1:]
payload = {
    "model": model,
    "messages": [{"role": "user", "content": prompt}],
    "max_tokens": 64,
    "temperature": 0,
}
request = urllib.request.Request(
    f"http://127.0.0.1:{port}/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(request, timeout=120) as response:
    body = response.read().decode("utf-8")

parsed = json.loads(body)
if not parsed.get("choices"):
    raise SystemExit("probe response did not include any choices")

with open(output_path, "w", encoding="utf-8") as handle:
    json.dump(parsed, handle, indent=2)
    handle.write("\n")
PY
}

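# Preserve the tail of the server log in the artifacts directory for post-mortem debugging.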
record_server_failure() {
  if [ -f "$server_log" ]; then
    tail -n 80 "$server_log" >"${artifacts_dir}/server-log-tail.txt" || true
  fi
}

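# Check whether vllm is importable, preferring the project virtualenv's interpreter when it exists.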
has_vllm() {
  if [ -x "${venv_bin_dir}/python" ]; then
    "${venv_bin_dir}/python" -c "import vllm" >/dev/null 2>&1
    return
  fi
  python3 -c "import vllm" >/dev/null 2>&1
}

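# Record a failed status whenever the script exits non-zero.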
on_exit() {
  local rc=$?
  if [ "$rc" -ne 0 ]; then
    write_status "failed" "serve.sh exited with status ${rc}"
  fi
}

trap on_exit EXIT

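# Main workflow: prepare artifacts, then either simulate or launch and probe the real server.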
mkdir -p "$artifacts_dir"
write_status "running" "Preparing vLLM startup workflow"

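# Write human-readable connection notes and example curl checks for the pod.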
cat >"${artifacts_dir}/connection.txt" <<EOF
Template: llm-inference
Model: ${MODEL_NAME:-Qwen/Qwen2.5-0.5B-Instruct}
Port: ${VLLM_PORT:-8000}

This template now defaults to a real vLLM startup and probe request.
Set SIMULATE_ONLY=1 to skip the server launch and validate only the RunPod module wiring.

OpenAI-compatible checks from inside the pod:
  curl http://127.0.0.1:${VLLM_PORT:-8000}/v1/models
  curl http://127.0.0.1:${VLLM_PORT:-8000}/v1/chat/completions \\
    -H 'content-type: application/json' \\
    -d '{"model":"${MODEL_NAME:-Qwen/Qwen2.5-0.5B-Instruct}","messages":[{"role":"user","content":"Hello from RunPod"}]}'
EOF

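# Simulation mode: skip the real launch and report success after a short delay.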
if [ "${SIMULATE_ONLY:-1}" = "1" ]; then
  sleep "${SIMULATED_DURATION_SECONDS:-3}"
  cat >"${artifacts_dir}/launch-plan.txt" <<EOF
Simulation mode completed.

To restore the real-serving default, unset SIMULATE_ONLY or set:
  SIMULATE_ONLY=0

Current serving command:
${SERVE_COMMAND:-vllm serve <model> ...}
EOF
  write_status "completed" "Simulation complete"
  exit 0
fi

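# Real-serving path: SERVE_COMMAND must contain the full serving invocation.
# Illustrative example only (flags depend on your vLLM version and hardware):
#   SERVE_COMMAND="vllm serve ${MODEL_NAME:-Qwen/Qwen2.5-0.5B-Instruct} --host 0.0.0.0 --port ${VLLM_PORT:-8000}"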
: "${SERVE_COMMAND:?SERVE_COMMAND must be set when SIMULATE_ONLY=0}"

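# Bootstrap a virtualenv and install vLLM when it is not already importable.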
if ! has_vllm; then
  : "${VLLM_INSTALL_COMMAND:?VLLM_INSTALL_COMMAND must be set when vllm is not preinstalled}"
  write_status "running" "Installing vLLM runtime"
  python3 -m venv "$venv_dir"
  bash -lc "source '${venv_bin_dir}/activate' && ${VLLM_INSTALL_COMMAND}" >"$install_log" 2>&1
fi

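# Prefer the venv-installed vllm binary when resolving SERVE_COMMAND.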
if [ -x "${venv_bin_dir}/vllm" ]; then
  export PATH="${venv_bin_dir}:${PATH}"
fi

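# Launch the server detached so it keeps running after this script exits; output goes to the server log.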
write_status "running" "Launching OpenAI-compatible vLLM server"
nohup bash -lc "${SERVE_COMMAND}" >"$server_log" 2>&1 &
server_pid=$!
echo "$server_pid" >"${artifacts_dir}/server.pid"
sleep "${READINESS_DELAY_SECONDS:-5}"

if ! kill -0 "$server_pid" 2>/dev/null; then
  record_server_failure
  wait "$server_pid"
fi

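# Block until /v1/models answers with JSON, capturing the model list as an artifact.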
write_status "running" "Waiting for vLLM API readiness"
if ! wait_for_json_endpoint \
  "http://127.0.0.1:${VLLM_PORT:-8000}/v1/models" \
  "$models_json" \
  "${VLLM_READINESS_TIMEOUT_SECONDS:-900}"; then
  record_server_failure
  if ! kill -0 "$server_pid" 2>/dev/null; then
    wait "$server_pid"
  fi
  exit 1
fi

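# Exercise the chat completions endpoint once to confirm end-to-end inference works.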
write_status "running" "Issuing probe request to the OpenAI-compatible API"
run_probe_request \
  "${VLLM_PORT:-8000}" \
  "${MODEL_NAME:?MODEL_NAME must be set}" \
  "${PROBE_PROMPT:-Reply with a short greeting from RunPod.}" \
  "$probe_json"

write_status "completed" "vLLM server is running and the probe request succeeded"
