#!/usr/bin/env bash
# Voidburn enterprise installer for an existing EKS cluster.
#
# Inputs are taken from environment variables (validated further down):
#   AWS_PROFILE, AWS_REGION, CLUSTER_NAME           - always required
#   TARGET_NODEGROUP | TARGET_NODEPOOL              - exactly one
#   WORKLOAD_IMAGE   | KANIKO_PUSH_IMAGE            - at least one
#   COSIGN_PUBKEY    | COSIGN_ISSUER + COSIGN_IDENTITY_REGEX
#
# Strict mode: abort on any unhandled error, on unset variables, and when any
# stage of a pipeline fails.
set -euo pipefail

# Version string echoed in the banner and substituted into the evidence summary.
INSTALLER_VERSION="voidburnctl-eks-enterprise-1.0.0"

# Print "ERROR: <message>" on stderr and terminate the script with status 1.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Abort (via die) unless the command named by $1 can be resolved by the shell.
need() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "missing dependency: $1"
  fi
}

# Current UTC time in ISO-8601 form, e.g. 2024-01-31T12:34:56Z.
utc() {
  date -u '+%Y-%m-%dT%H:%M:%SZ'
}

# Current UTC time in compact form (used for directory names), e.g. 20240131T123456Z.
tsid() {
  date -u '+%Y%m%dT%H%M%SZ'
}

# Fail fast when any external tool the installer shells out to is missing.
# The order matches the original checks, so the first missing tool is reported.
for required_tool in aws kubectl eksctl jq cosign openssl base64; do
  need "$required_tool"
done

# -------- required env --------
# Give every input a defined (possibly empty) value so `set -u` never trips.
: "${AWS_PROFILE:=}"
: "${AWS_REGION:=}"
: "${CLUSTER_NAME:=}"
: "${TARGET_NODEGROUP:=}"
: "${TARGET_NODEPOOL:=}"

# All three AWS/cluster coordinates are mandatory.
if [[ -z "$AWS_PROFILE" || -z "$AWS_REGION" || -z "$CLUSTER_NAME" ]]; then
  die "Set AWS_PROFILE, AWS_REGION, CLUSTER_NAME"
fi

# Exactly one target selector must be given: "g" marks a nodegroup, "p" a nodepool.
case "${TARGET_NODEGROUP:+g}${TARGET_NODEPOOL:+p}" in
  gp) die "Set only one of TARGET_NODEGROUP or TARGET_NODEPOOL" ;;
  "") die "Set TARGET_NODEGROUP (ASG-backed) or TARGET_NODEPOOL (Karpenter NodePool/Provisioner)" ;;
esac

# -------- optional env --------
# Each setting keeps a caller-supplied value and otherwise falls back to its default.
: "${TEMPLATE_URL:=https://voidburn-cfn-002904041222.s3.amazonaws.com/voidburn-sentinel-cfn.yaml}"
: "${BACKEND_URL:=https://api.voidburn.com}"

: "${AGENT_NAMESPACE:=voidburn}"
: "${WORKLOAD_NAMESPACE:=vb-workloads}"

: "${ALLOW_LABEL_KEY:=voidburn.com/target}"
: "${ALLOW_LABEL_VALUE:=true}"
: "${AUTO_LABEL_TARGETS:=true}"

# Target selector (passed into CloudFormation; used by the in-cluster labeler).
# A Karpenter nodepool takes precedence; otherwise the EKS nodegroup label is used.
if [[ -n "$TARGET_NODEPOOL" ]]; then
  TARGET_LABEL_KEY="karpenter.sh/nodepool"
  TARGET_LABEL_VALUE="$TARGET_NODEPOOL"
  TARGET_SCOPE="nodepool:${TARGET_NODEPOOL}"
else
  TARGET_LABEL_KEY="eks.amazonaws.com/nodegroup"
  TARGET_LABEL_VALUE="$TARGET_NODEGROUP"
  TARGET_SCOPE="nodegroup:${TARGET_NODEGROUP}"
fi

: "${CHECKPOINT_MODE:=strict}"
: "${CHECKPOINT_STORAGE:=efs}"
: "${CHECKPOINT_WINDOW_SECONDS:=300}"

# The checkpoint URL default is derived from the workload service and namespace.
: "${WORKLOAD_SERVICE:=voidburn-workload}"
: "${WORKLOAD_CHECKPOINT_URL:=http://${WORKLOAD_SERVICE}.${WORKLOAD_NAMESPACE}.svc.cluster.local:8080/checkpoint}"

# enterprise image options (choose one)
: "${WORKLOAD_IMAGE:=}"
: "${KANIKO_PUSH_IMAGE:=}"
: "${IMAGE_PULL_SECRET:=}"
: "${KANIKO_REGISTRY_SECRET:=registry-creds}"

# At least one image source is needed: a pre-built image or a Kaniko push target.
[[ -n "$WORKLOAD_IMAGE" || -n "$KANIKO_PUSH_IMAGE" ]] \
  || die "Set WORKLOAD_IMAGE (preferred enterprise path) OR KANIKO_PUSH_IMAGE (kaniko build mode)"

# cosign gates
: "${COSIGN_ISSUER:=}"
: "${COSIGN_IDENTITY_REGEX:=}"
: "${COSIGN_PUBKEY:=}"

# Without a public key, keyless verification needs both the issuer and the identity regex.
if [[ -z "$COSIGN_PUBKEY" ]] && [[ -z "$COSIGN_ISSUER" || -z "$COSIGN_IDENTITY_REGEX" ]]; then
  die "Set COSIGN_PUBKEY or (COSIGN_ISSUER + COSIGN_IDENTITY_REGEX)"
fi

# Evidence folder (default name carries the UTC start time of this run)
: "${EVIDENCE_DIR:=voidburn-evidence-$(tsid)}"
mkdir -p "$EVIDENCE_DIR"

# Log everything to evidence: from here on stdout and stderr are both teed
# (appended) into install.log while still being shown to the operator.
exec > >(tee -a "$EVIDENCE_DIR/install.log") 2>&1

# Command prefix for every AWS CLI call; it is expanded unquoted at the call
# sites, so the profile and region must not contain whitespace.
AWS="aws --profile $AWS_PROFILE --region $AWS_REGION"
ASG_NAME=""
CAPACITY_TYPE=""

# Run banner, terminated by an empty line.
printf '%s\n' \
  "== Voidburn Enterprise Install ==" \
  "installer=$INSTALLER_VERSION" \
  "utc=$(utc)" \
  "cluster=$CLUSTER_NAME region=$AWS_REGION target=$TARGET_SCOPE" \
  "evidenceDir=$EVIDENCE_DIR" \
  ""

# ---------- helpers ----------
# Status tokens recorded per gate; every gate starts out as failed and is
# flipped by the step that proves it.
pass="PASS"
fail="FAIL"
for _vb_flag in \
  PASS_IDENTITY PASS_KUBECTL PASS_NODEGROUP \
  PASS_SCALER PASS_AUTOSCALER_TAGS PASS_AUTOSCALER_POLICY \
  PASS_COSIGN PASS_SBOM \
  PASS_IAM_SIM \
  PASS_EFS PASS_SECRETS PASS_STRICT_SMOKE PASS_SENTINEL \
  PASS_DRYRUN; do
  printf -v "$_vb_flag" '%s' "$fail"
done
unset _vb_flag

# Render templates/EVIDENCE_SUMMARY.template.md (relative to the current
# directory) into $EVIDENCE_DIR/EVIDENCE_SUMMARY.md by replacing every
# {{PLACEHOLDER}} with the matching global value.
# Globals read: EVIDENCE_DIR, CLUSTER_NAME, AWS_REGION, TARGET_SCOPE,
#               INSTALLER_VERSION and all PASS_* gate flags; calls utc.
# Returns: the exit status of the final sed invocation.
write_summary(){
  local out="$EVIDENCE_DIR/EVIDENCE_SUMMARY.md"
  local template="templates/EVIDENCE_SUMMARY.template.md"
  # placeholder=value pairs; note {{TARGET_NODEGROUP}} is filled from TARGET_SCOPE.
  local -a fields=(
    "CLUSTER_NAME=$CLUSTER_NAME"
    "AWS_REGION=$AWS_REGION"
    "UTC_TIMESTAMP=$(utc)"
    "TARGET_NODEGROUP=$TARGET_SCOPE"
    "INSTALLER_VERSION=$INSTALLER_VERSION"
    "PASS_IDENTITY=$PASS_IDENTITY"
    "PASS_KUBECTL=$PASS_KUBECTL"
    "PASS_NODEGROUP=$PASS_NODEGROUP"
    "PASS_SCALER=$PASS_SCALER"
    "PASS_AUTOSCALER_TAGS=$PASS_AUTOSCALER_TAGS"
    "PASS_AUTOSCALER_POLICY=$PASS_AUTOSCALER_POLICY"
    "PASS_COSIGN=$PASS_COSIGN"
    "PASS_SBOM=$PASS_SBOM"
    "PASS_IAM_SIM=$PASS_IAM_SIM"
    "PASS_EFS=$PASS_EFS"
    "PASS_SECRETS=$PASS_SECRETS"
    "PASS_STRICT_SMOKE=$PASS_STRICT_SMOKE"
    "PASS_SENTINEL=$PASS_SENTINEL"
    "PASS_DRYRUN=$PASS_DRYRUN"
  )
  local -a sed_script=()
  local field placeholder value

  for field in "${fields[@]}"; do
    placeholder="${field%%=*}"
    value="${field#*=}"
    # Escape the characters that are special in a sed replacement (\, / and &)
    # so arbitrary values cannot break or alter the substitution.
    value="$(printf '%s' "$value" | sed -e 's#[\\/&]#\\&#g')"
    sed_script+=(-e "s/{{${placeholder}}}/${value}/g")
  done

  sed "${sed_script[@]}" "$template" > "$out"
}

# Signature gate for a container image.
# Arguments: $1 - image reference to verify.
# Globals read:    COSIGN_PUBKEY, COSIGN_ISSUER, COSIGN_IDENTITY_REGEX,
#                  EVIDENCE_DIR, pass
# Globals written: PASS_COSIGN (PASS on success), PASS_SBOM (PASS or WARN)
# Behaviour: verifies with the public key when COSIGN_PUBKEY is set, otherwise
# keyless (OIDC issuer + identity regex). A verification failure aborts the
# script via die. The SBOM download is best-effort and never fatal.
verify_cosign(){
  local img="$1"
  local verify_out="$EVIDENCE_DIR/cosign_verify_workload.json"
  local sbom_out="$EVIDENCE_DIR/workload.sbom.json"
  local mode
  local -a verify_args

  echo "Cosign verify gate for image: $img"
  if [[ -n "$COSIGN_PUBKEY" ]]; then
    mode="pubkey"
    verify_args=(--key "$COSIGN_PUBKEY")
  else
    mode="keyless"
    verify_args=(
      --certificate-oidc-issuer "$COSIGN_ISSUER"
      --certificate-identity-regexp "$COSIGN_IDENTITY_REGEX"
    )
  fi

  # With pipefail (set at the top of the script) a cosign failure fails the pipeline.
  cosign verify "${verify_args[@]}" "$img" --output json | tee "$verify_out" >/dev/null \
    || die "cosign verify failed ($mode)"

  # `cosign download sbom` only fetches the attached SBOM; it takes no
  # verification key. Passing --key made the command fail on the unknown flag,
  # and because errors are swallowed here the SBOM gate was always WARN in
  # pubkey mode. Missing SBOMs are tolerated on purpose (best-effort).
  cosign download sbom "$img" > "$sbom_out" 2>/dev/null || true

  PASS_COSIGN="$pass"
  if [[ -s "$sbom_out" ]]; then
    PASS_SBOM="$pass"
  else
    PASS_SBOM="WARN"
  fi
}

# ---------- step 1: identity ----------
# Record the caller identity as evidence; any failure in the pipeline is fatal.
echo "[1] AWS identity"
$AWS sts get-caller-identity | tee "$EVIDENCE_DIR/aws_identity.json" >/dev/null \
  || die "AWS identity failed"
PASS_IDENTITY="$pass"

# Refresh the kubeconfig entry for the cluster, then prove the API is reachable
# by listing nodes (the listing is kept as evidence).
echo "[2] kubeconfig + cluster reachability"
$AWS eks update-kubeconfig --name "$CLUSTER_NAME" >/dev/null
kubectl get nodes -o wide | tee "$EVIDENCE_DIR/nodes_initial.txt" >/dev/null \
  || die "kubectl cannot reach cluster"
PASS_KUBECTL="$pass"

# ---------- step 2: nodegroup validation ----------
echo "[3] Validate target scope"
$AWS eks list-nodegroups --cluster-name "$CLUSTER_NAME" > "$EVIDENCE_DIR/eks_list_nodegroups.json"

if [[ -z "$TARGET_NODEPOOL" ]]; then
  # Managed nodegroup target: it must expose an Auto Scaling group and be ON_DEMAND.
  ASG_NAME="$($AWS eks describe-nodegroup --cluster-name "$CLUSTER_NAME" --nodegroup-name "$TARGET_NODEGROUP" \
    --query 'nodegroup.resources.autoScalingGroups[0].name' --output text)"
  if [[ -z "$ASG_NAME" || "$ASG_NAME" == "None" ]]; then
    die "target is not ASG-backed"
  fi
  CAPACITY_TYPE="$($AWS eks describe-nodegroup --cluster-name "$CLUSTER_NAME" --nodegroup-name "$TARGET_NODEGROUP" \
    --query 'nodegroup.capacityType' --output text)"
  if [[ "$CAPACITY_TYPE" != "ON_DEMAND" ]]; then
    die "capacityType=$CAPACITY_TYPE (need ON_DEMAND)"
  fi
  PASS_NODEGROUP="$pass"
  echo "OK ASG=$ASG_NAME capacity=$CAPACITY_TYPE"
elif kubectl get nodepools.karpenter.sh "$TARGET_NODEPOOL" -o yaml > "$EVIDENCE_DIR/target_nodepool.yaml" 2>/dev/null; then
  # Karpenter target found as a NodePool resource.
  PASS_NODEGROUP="$pass"
  TARGET_LABEL_KEY="karpenter.sh/nodepool"
  TARGET_LABEL_VALUE="$TARGET_NODEPOOL"
  TARGET_SCOPE="nodepool:${TARGET_NODEPOOL}"
  echo "OK target=karpenter nodepool=$TARGET_NODEPOOL"
elif kubectl get provisioners.karpenter.sh "$TARGET_NODEPOOL" -o yaml > "$EVIDENCE_DIR/target_provisioner.yaml" 2>/dev/null; then
  # Fallback: the name resolves as a legacy Provisioner, which uses a different node label.
  PASS_NODEGROUP="$pass"
  TARGET_LABEL_KEY="karpenter.sh/provisioner-name"
  TARGET_LABEL_VALUE="$TARGET_NODEPOOL"
  TARGET_SCOPE="provisioner:${TARGET_NODEPOOL}"
  echo "OK target=karpenter provisioner=$TARGET_NODEPOOL (legacy)"
else
  die "TARGET_NODEPOOL not found as NodePool or Provisioner: $TARGET_NODEPOOL"
fi

# ---------- step 3: IRSA ----------
# Best-effort: a failure to associate the OIDC provider is deliberately ignored here.
echo "[4] IRSA OIDC"
eksctl utils associate-iam-oidc-provider \
  --cluster "$CLUSTER_NAME" \
  --region "$AWS_REGION" \
  --profile "$AWS_PROFILE" \
  --approve >/dev/null || true

# ---------- step 4: scaler ----------
echo "[5] Ensure scaler"
# ASG tag keys/values shared by the autoscaler IAM policy condition, the
# auto-discovery flag and the ASG tagging loop.
AUTOSCALER_TAG_ENABLED_KEY="k8s.io/cluster-autoscaler/enabled"
AUTOSCALER_TAG_CLUSTER_KEY="k8s.io/cluster-autoscaler/${CLUSTER_NAME}"
AUTOSCALER_TAG_CLUSTER_VAL="owned"

# Locate the Karpenter controller deployment.
# Outputs: "namespace/deployment" on stdout for the first match.
# Returns: 0 when a deployment was found, 1 otherwise.
detect_karpenter_deploy(){
  local candidate match

  # Well-known locations first, in order of preference.
  for candidate in \
    karpenter/karpenter \
    karpenter/karpenter-controller \
    kube-system/karpenter \
    kube-system/karpenter-controller; do
    if kubectl -n "${candidate%%/*}" get deploy "${candidate##*/}" >/dev/null 2>&1; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done

  # Otherwise scan every namespace for a deployment whose name or common
  # labels identify it as Karpenter, and take the first hit.
  match="$(kubectl get deploy -A -o json 2>/dev/null | jq -r '
    .items[]
    | select(
        (.metadata.name == "karpenter" or .metadata.name == "karpenter-controller") or
        (.metadata.labels["app.kubernetes.io/name"] == "karpenter") or
        (.metadata.labels["app.kubernetes.io/part-of"] == "karpenter") or
        (.metadata.labels.app == "karpenter")
      )
    | "\(.metadata.namespace)/\(.metadata.name)"
  ' | head -n1)"
  if [[ -z "$match" ]]; then
    return 1
  fi
  printf '%s\n' "$match"
}

SCALER_TYPE="cluster-autoscaler"
KARPENTER_NS=""
KARPENTER_DEP=""
# Best-effort detection: stays empty when no Karpenter controller deployment is found.
KARPENTER_DEPLOY="$(detect_karpenter_deploy || true)"

# Karpenter target requires the Karpenter controller to exist; Cluster Autoscaler is not sufficient.
if [[ -n "$TARGET_NODEPOOL" && -z "$KARPENTER_DEPLOY" ]]; then
  die "TARGET_NODEPOOL requires Karpenter (controller deployment not found)"
fi

# Decide which scaler to rely on:
#  - a nodepool target always uses Karpenter;
#  - for a nodegroup target, prefer Cluster Autoscaler when present (it scales
#    ASG-backed nodegroups); if CA is absent but Karpenter exists, accept
#    Karpenter as the scaler.
use_karpenter="false"
if [[ -n "$TARGET_NODEPOOL" ]]; then
  use_karpenter="true"
elif [[ -n "$KARPENTER_DEPLOY" ]] && ! kubectl -n kube-system get deploy cluster-autoscaler >/dev/null 2>&1; then
  use_karpenter="true"
fi

if [[ "$use_karpenter" == "true" ]]; then
  SCALER_TYPE="karpenter"
  PASS_SCALER="$pass"
  # ASG tagging and the autoscaler IAM policy do not apply to Karpenter.
  PASS_AUTOSCALER_TAGS="SKIP"
  PASS_AUTOSCALER_POLICY="SKIP"
  KARPENTER_NS="${KARPENTER_DEPLOY%/*}"
  KARPENTER_DEP="${KARPENTER_DEPLOY#*/}"
  echo "Karpenter present: ${KARPENTER_NS}/${KARPENTER_DEP}"
  kubectl -n "$KARPENTER_NS" rollout status deploy/"$KARPENTER_DEP" --timeout=3m >/dev/null || die "karpenter not Ready"
  # Evidence capture is best-effort; missing CRDs or logs must not fail the install.
  kubectl -n "$KARPENTER_NS" get deploy "$KARPENTER_DEP" -o yaml > "$EVIDENCE_DIR/karpenter_deploy.yaml" 2>/dev/null || true
  kubectl -n "$KARPENTER_NS" logs deploy/"$KARPENTER_DEP" --tail=250 > "$EVIDENCE_DIR/karpenter_tail.log" 2>/dev/null || true
  kubectl get nodepools.karpenter.sh -A -o yaml > "$EVIDENCE_DIR/karpenter_nodepools.yaml" 2>/dev/null || true
  kubectl get ec2nodeclasses.karpenter.k8s.aws -A -o yaml > "$EVIDENCE_DIR/karpenter_ec2nodeclasses.yaml" 2>/dev/null || true
else
  # Create tag-scoped IAM policy for autoscaler scale actions
  POLICY_NAME="VoidburnClusterAutoscalerPolicy-${CLUSTER_NAME}"
  POLICY_DOC="$EVIDENCE_DIR/cluster_autoscaler_policy_tag_scoped.json"
  cat > "$POLICY_DOC" <<JSON
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "ReadOnly",
      "Effect": "Allow",
      "Action": [
        "autoscaling:DescribeAutoScalingGroups",
        "autoscaling:DescribeAutoScalingInstances",
        "autoscaling:DescribeLaunchConfigurations",
        "autoscaling:DescribeTags",
        "ec2:DescribeInstances",
        "ec2:DescribeLaunchTemplateVersions",
        "ec2:DescribeInstanceTypes",
        "ec2:DescribeImages",
        "ec2:DescribeSubnets",
        "ec2:DescribeVpcs",
        "ec2:DescribeSecurityGroups",
        "ec2:DescribeAvailabilityZones",
        "eks:DescribeNodegroup"
      ],
      "Resource": "*"
    },
    {
      "Sid": "ScaleOnlyTaggedASGs",
      "Effect": "Allow",
      "Action": [
        "autoscaling:SetDesiredCapacity",
        "autoscaling:UpdateAutoScalingGroup",
        "autoscaling:TerminateInstanceInAutoScalingGroup"
      ],
      "Resource": "*",
      "Condition": {
        "StringEquals": {
          "autoscaling:ResourceTag/${AUTOSCALER_TAG_ENABLED_KEY}": "true",
          "autoscaling:ResourceTag/${AUTOSCALER_TAG_CLUSTER_KEY}": "${AUTOSCALER_TAG_CLUSTER_VAL}"
        }
      }
    }
  ]
}
JSON

  # Reuse the policy when it already exists. The lookup uses JSON output on
  # purpose: with --output text the CLI evaluates --query once per result page,
  # so accounts with more than one page of customer-managed policies would
  # yield a multi-line "None ... arn" value. With JSON the query runs once over
  # the complete result; jq turns a null (not found) into an empty string.
  POLICY_ARN="$($AWS iam list-policies --scope Local \
    --query "Policies[?PolicyName=='${POLICY_NAME}'].Arn | [0]" --output json | jq -r '. // empty')"
  if [[ -z "$POLICY_ARN" || "$POLICY_ARN" == "None" ]]; then
    POLICY_ARN="$($AWS iam create-policy --policy-name "$POLICY_NAME" --policy-document "file://$POLICY_DOC" --query 'Policy.Arn' --output text)"
  fi
  echo "$POLICY_ARN" > "$EVIDENCE_DIR/cluster_autoscaler_policy_arn.txt"
  PASS_AUTOSCALER_POLICY="$pass"

  # IRSA SA (only created when no cluster-autoscaler service account exists yet)
  if ! kubectl -n kube-system get sa cluster-autoscaler >/dev/null 2>&1; then
    eksctl create iamserviceaccount \
      --cluster "$CLUSTER_NAME" --region "$AWS_REGION" --profile "$AWS_PROFILE" \
      --namespace kube-system --name cluster-autoscaler \
      --attach-policy-arn "$POLICY_ARN" \
      --approve >/dev/null
  fi

  # Deploy autoscaler if missing (EKS 1.29)
  if ! kubectl -n kube-system get deploy cluster-autoscaler >/dev/null 2>&1; then
    kubectl apply -f https://raw.githubusercontent.com/kubernetes/autoscaler/cluster-autoscaler-1.29.0/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml >/dev/null
    kubectl -n kube-system patch deploy cluster-autoscaler --type='json' -p='[
      {"op":"replace","path":"/spec/template/spec/serviceAccountName","value":"cluster-autoscaler"}
    ]' >/dev/null || true
  fi

  # Avoid surge=1 on a 1-replica deployment: small clusters may not be able to schedule 2x 600Mi during rollout.
  kubectl -n kube-system patch deploy cluster-autoscaler --type='merge' -p '{
    "spec": {
      "strategy": {
        "type": "RollingUpdate",
        "rollingUpdate": { "maxSurge": 0, "maxUnavailable": 1 }
      }
    }
  }' >/dev/null || true

  kubectl -n kube-system set image deploy/cluster-autoscaler \
    cluster-autoscaler=registry.k8s.io/autoscaling/cluster-autoscaler:v1.29.0 >/dev/null

  # The upstream manifest uses `command:` (not `args:`) and includes a <YOUR CLUSTER NAME> placeholder.
  # Replace the full command list deterministically to ensure autodiscovery works.
  AUTODISCOVERY="asg:tag=${AUTOSCALER_TAG_ENABLED_KEY},${AUTOSCALER_TAG_CLUSTER_KEY}"
  CA_PATCH="$(jq -nc --arg cluster "$CLUSTER_NAME" --arg autod "$AUTODISCOVERY" '[
    {
      op: "replace",
      path: "/spec/template/spec/containers/0/command",
      value: [
        "./cluster-autoscaler",
        "--cloud-provider=aws",
        "--namespace=kube-system",
        ("--cluster-name=" + $cluster),
        ("--node-group-auto-discovery=" + $autod),
        "--balance-similar-node-groups",
        "--skip-nodes-with-system-pods=false",
        "--skip-nodes-with-local-storage=false",
        "--stderrthreshold=info",
        "--v=4"
      ]
    }
  ]')"
  kubectl -n kube-system patch deploy cluster-autoscaler --type='json' -p="$CA_PATCH" >/dev/null

  # Tag ASGs correctly (owned): every ASG-backed nodegroup of the cluster gets
  # the two tags that the policy condition and auto-discovery rely on.
  for NG in $($AWS eks list-nodegroups --cluster-name "$CLUSTER_NAME" --query 'nodegroups[]' --output text); do
    NG_ASG="$($AWS eks describe-nodegroup --cluster-name "$CLUSTER_NAME" --nodegroup-name "$NG" \
      --query 'nodegroup.resources.autoScalingGroups[0].name' --output text)"
    [[ -n "$NG_ASG" && "$NG_ASG" != "None" ]] || continue
    $AWS autoscaling create-or-update-tags --tags \
      "ResourceId=$NG_ASG,ResourceType=auto-scaling-group,Key=${AUTOSCALER_TAG_ENABLED_KEY},Value=true,PropagateAtLaunch=true" \
      "ResourceId=$NG_ASG,ResourceType=auto-scaling-group,Key=${AUTOSCALER_TAG_CLUSTER_KEY},Value=${AUTOSCALER_TAG_CLUSTER_VAL},PropagateAtLaunch=true" >/dev/null
  done
  PASS_AUTOSCALER_TAGS="$pass"

  kubectl -n kube-system rollout status deploy/cluster-autoscaler --timeout=3m >/dev/null || die "autoscaler not Ready"
  PASS_SCALER="$pass"
fi

# ---------- step 5: cosign gate ----------
# A pre-built image is verified right away; in Kaniko mode the image does not
# exist yet, so verification is deferred until after the build.
echo "[6] Cosign gate (workload image)"
if [[ -z "$WORKLOAD_IMAGE" ]]; then
  echo "WORKLOAD_IMAGE not set; will verify after Kaniko build"
else
  verify_cosign "$WORKLOAD_IMAGE"
fi

# ---------- step 6: stack install ----------
echo "[7] Install Sentinel stack (CFN)"
# Read the API key without echoing it. -r keeps backslashes in the key intact.
# A failed read (e.g. closed stdin) is tolerated here so that the emptiness
# check below reports a clear error instead of `set -e` exiting silently.
read -r -s -p "Voidburn API key: " VOIDBURN_API_KEY || true
echo
[[ -n "${VOIDBURN_API_KEY:-}" ]] || die "Voidburn API key empty"

# If WORKLOAD_CHECKPOINT_SECRET is not provided, reuse the existing in-cluster value when present.
# This keeps reruns deterministic (and avoids mismatching Sentinel vs receiver secrets on a completed stack).
if [[ -z "${WORKLOAD_CHECKPOINT_SECRET:-}" ]]; then
  existing_b64="$(kubectl -n "$AGENT_NAMESPACE" get secret voidburn-checkpoint-secret -o jsonpath='{.data.secret}' 2>/dev/null || true)"
  existing=""
  if [[ -n "$existing_b64" ]]; then
    existing="$(printf '%s' "$existing_b64" | base64 -d 2>/dev/null || true)"
  fi
  if [[ -n "$existing" ]]; then
    WORKLOAD_CHECKPOINT_SECRET="$existing"
  else
    # No usable secret in the cluster: generate a fresh 32-byte hex secret.
    WORKLOAD_CHECKPOINT_SECRET="$(openssl rand -hex 32)"
  fi
fi

# NOTE: the default stack name embeds the current epoch seconds, so a rerun only
# finds (and reuses) an earlier stack when STACK_NAME is passed in explicitly.
STACK_NAME="${STACK_NAME:-voidburn-sentinel-${CLUSTER_NAME}-$(date +%s)}"
if $AWS cloudformation describe-stacks --stack-name "$STACK_NAME" >/dev/null 2>&1; then
  STACK_STATUS="$($AWS cloudformation describe-stacks --stack-name "$STACK_NAME" --query 'Stacks[0].StackStatus' --output text)"
  echo "Stack exists: $STACK_NAME status=$STACK_STATUS"
  # Only a fully created/updated stack is reused; anything else needs operator action.
  case "$STACK_STATUS" in
    CREATE_COMPLETE|UPDATE_COMPLETE) : ;;
    *) die "stack exists but is not complete (status=$STACK_STATUS). Delete it or set STACK_NAME to a new value." ;;
  esac
else
  # NOTE(review): the API key and checkpoint secret are passed as command-line
  # parameter values and are therefore visible in the local process list while
  # the command runs.
  $AWS cloudformation create-stack \
    --stack-name "$STACK_NAME" \
    --template-url "$TEMPLATE_URL" \
    --capabilities CAPABILITY_NAMED_IAM \
    --parameters \
      ParameterKey=ClusterName,ParameterValue="$CLUSTER_NAME" \
      ParameterKey=Namespace,ParameterValue="$AGENT_NAMESPACE" \
      ParameterKey=WorkloadNamespace,ParameterValue="$WORKLOAD_NAMESPACE" \
      ParameterKey=ApiKey,ParameterValue="$VOIDBURN_API_KEY" \
      ParameterKey=BackendUrl,ParameterValue="$BACKEND_URL" \
      ParameterKey=AllowLabelKey,ParameterValue="$ALLOW_LABEL_KEY" \
      ParameterKey=AllowLabelValue,ParameterValue="$ALLOW_LABEL_VALUE" \
      ParameterKey=AutoLabelTargets,ParameterValue="$AUTO_LABEL_TARGETS" \
      ParameterKey=TargetNodegroupLabelKey,ParameterValue="$TARGET_LABEL_KEY" \
      ParameterKey=TargetNodegroupName,ParameterValue="$TARGET_LABEL_VALUE" \
      ParameterKey=CheckpointMode,ParameterValue="$CHECKPOINT_MODE" \
      ParameterKey=CheckpointTriggerEnabled,ParameterValue=true \
      ParameterKey=WorkloadCheckpointUrl,ParameterValue="$WORKLOAD_CHECKPOINT_URL" \
      ParameterKey=WorkloadCheckpointSecret,ParameterValue="$WORKLOAD_CHECKPOINT_SECRET" \
      ParameterKey=WorkloadRequireValidated,ParameterValue=true \
      ParameterKey=WorkloadRequireSignature,ParameterValue=true \
      ParameterKey=CheckpointStorage,ParameterValue="$CHECKPOINT_STORAGE" \
      ParameterKey=CreateControlNodegroup,ParameterValue=true \
      ParameterKey=PinAgentToProtectedNode,ParameterValue=true >/dev/null

  $AWS cloudformation wait stack-create-complete --stack-name "$STACK_NAME" || die "stack create failed"
fi
# Keep the final stack description as evidence.
$AWS cloudformation describe-stacks --stack-name "$STACK_NAME" > "$EVIDENCE_DIR/stack_describe.json"

# ---------- step 7: secrets hardening ----------
echo "[8] Hardening secrets (prevents empty API_KEY + receiver/workload mismatch)"

# Make sure both namespaces exist before writing secrets into them.
for _vb_ns in "$AGENT_NAMESPACE" "$WORKLOAD_NAMESPACE"; do
  if ! kubectl get ns "$_vb_ns" >/dev/null 2>&1; then
    kubectl create ns "$_vb_ns" >/dev/null
  fi
done

# Recreate the agent API-key secret and confirm the stored value is non-empty.
kubectl -n "$AGENT_NAMESPACE" delete secret voidburn-agent --ignore-not-found >/dev/null 2>&1 || true
kubectl -n "$AGENT_NAMESPACE" create secret generic voidburn-agent --from-literal=apiKey="$VOIDBURN_API_KEY" >/dev/null
if ! [[ "$(kubectl -n "$AGENT_NAMESPACE" get secret voidburn-agent -o jsonpath='{.data.apiKey}' | wc -c)" -gt 0 ]]; then
  die "voidburn-agent secret empty"
fi

# Workload namespace: drop both checkpoint secrets first, then recreate them
# from the single shared value.
for _vb_secret in vb-workload-secret voidburn-checkpoint-secret; do
  kubectl -n "$WORKLOAD_NAMESPACE" delete secret "$_vb_secret" --ignore-not-found >/dev/null 2>&1 || true
done
for _vb_secret in vb-workload-secret voidburn-checkpoint-secret; do
  kubectl -n "$WORKLOAD_NAMESPACE" create secret generic "$_vb_secret" --from-literal=secret="$WORKLOAD_CHECKPOINT_SECRET" >/dev/null
done

# Strict secret contract: ensure Sentinel and receiver use the same checkpoint secret.
kubectl -n "$AGENT_NAMESPACE" delete secret voidburn-checkpoint-secret --ignore-not-found >/dev/null 2>&1 || true
kubectl -n "$AGENT_NAMESPACE" create secret generic voidburn-checkpoint-secret --from-literal=secret="$WORKLOAD_CHECKPOINT_SECRET" >/dev/null

# Restart a deployment (if it exists) and wait for it to become Ready.
# Arguments: $1 namespace, $2 deployment name, $3 error message on timeout.
_vb_restart_if_present(){
  if kubectl -n "$1" get deploy "$2" >/dev/null 2>&1; then
    kubectl -n "$1" rollout restart deploy/"$2" >/dev/null || true
    kubectl -n "$1" rollout status deploy/"$2" --timeout=3m >/dev/null || die "$3"
  fi
}

# Secrets are read via env vars; restart deployments to pick up new values deterministically.
_vb_restart_if_present "$AGENT_NAMESPACE" sentinel-sentinel "sentinel not Ready after secret update"
_vb_restart_if_present "$WORKLOAD_NAMESPACE" voidburn-checkpoint "receiver not Ready after secret update"
PASS_SECRETS="$pass"

# ---------- step 8: storage gate ----------
echo "[9] Storage gate (efs RWX required when storage=efs)"
if [[ "$CHECKPOINT_STORAGE" == "efs" ]]; then
  # Capture the pod list before searching it. Piping kubectl straight into
  # `grep -q` under `set -o pipefail` can fail spuriously: grep exits on the
  # first match and kubectl may then be killed by SIGPIPE, failing the pipeline
  # even though the driver pods exist. (grep also replaces the obsolescent egrep.)
  kube_system_pods="$(kubectl -n kube-system get pods)" || die "efs-csi not running"
  grep -qi -- 'efs-csi' <<<"$kube_system_pods" || die "efs-csi not running"
  kubectl get storageclass voidburn-efs >/dev/null 2>&1 || die "storageclass voidburn-efs missing"
  kubectl -n "$WORKLOAD_NAMESPACE" get pvc checkpoint-pvc >/dev/null 2>&1 || die "checkpoint-pvc missing"

  # PVC binding can take several minutes on fresh clusters while EFS CSI becomes Ready.
  PVC_WAIT_SECONDS=900
  deadline=$(( $(date +%s) + PVC_WAIT_SECONDS ))
  phase=""
  # Poll every 5 seconds until the claim reports Bound or the deadline passes.
  while [[ $(date +%s) -lt $deadline ]]; do
    phase="$(kubectl -n "$WORKLOAD_NAMESPACE" get pvc checkpoint-pvc -o jsonpath='{.status.phase}' 2>/dev/null || true)"
    [[ "$phase" == "Bound" ]] && break
    sleep 5
  done

  if [[ "$phase" != "Bound" ]]; then
    # Collect diagnostics (all best-effort) before giving up.
    kubectl -n "$WORKLOAD_NAMESPACE" get pvc checkpoint-pvc -o yaml > "$EVIDENCE_DIR/checkpoint_pvc.yaml" 2>/dev/null || true
    kubectl -n "$WORKLOAD_NAMESPACE" describe pvc checkpoint-pvc > "$EVIDENCE_DIR/checkpoint_pvc.describe.txt" 2>/dev/null || true
    kubectl -n "$WORKLOAD_NAMESPACE" get pod voidburn-checkpoint-binder -o yaml > "$EVIDENCE_DIR/checkpoint_binder_pod.yaml" 2>/dev/null || true
    kubectl -n "$WORKLOAD_NAMESPACE" describe pod voidburn-checkpoint-binder > "$EVIDENCE_DIR/checkpoint_binder_pod.describe.txt" 2>/dev/null || true
    kubectl -n "$WORKLOAD_NAMESPACE" get events --sort-by=.lastTimestamp > "$EVIDENCE_DIR/${WORKLOAD_NAMESPACE}_events.txt" 2>/dev/null || true
    kubectl -n kube-system get pods -o wide > "$EVIDENCE_DIR/kube_system_pods.txt" 2>/dev/null || true
    kubectl -n kube-system get events --sort-by=.lastTimestamp > "$EVIDENCE_DIR/kube_system_events.txt" 2>/dev/null || true
    die "checkpoint-pvc not Bound (phase=$phase)"
  fi

  kubectl -n "$WORKLOAD_NAMESPACE" get pvc checkpoint-pvc -o yaml > "$EVIDENCE_DIR/checkpoint_pvc.yaml"
  PASS_EFS="$pass"
else
  # Any other storage mode skips the EFS checks entirely.
  PASS_EFS="SKIP"
fi

# ---------- step 9: workload deploy ----------
echo "[10] Deploy workload (BYO image or Kaniko build)"
# Optional pod-spec fragment spliced into the workload Deployment manifest;
# it stays empty when no pull secret was requested.
PULL_SECRET_YAML=""
if [[ -n "$IMAGE_PULL_SECRET" ]]; then
  # Report the namespace that was actually searched (it is configurable via
  # WORKLOAD_NAMESPACE) instead of a hard-coded name.
  kubectl -n "$WORKLOAD_NAMESPACE" get secret "$IMAGE_PULL_SECRET" >/dev/null \
    || die "IMAGE_PULL_SECRET not found in $WORKLOAD_NAMESPACE"
  PULL_SECRET_YAML="imagePullSecrets: [{name: $IMAGE_PULL_SECRET}]"
fi

# Kaniko mode: no pre-built image was supplied, so build a minimal checkpoint
# server in-cluster, push it to KANIKO_PUSH_IMAGE and run the cosign gate on it.
if [[ -z "$WORKLOAD_IMAGE" ]]; then
  echo "Kaniko mode requested; building to $KANIKO_PUSH_IMAGE"
  kubectl create ns voidburn-build >/dev/null 2>&1 || true
  kubectl -n voidburn-build get secret "$KANIKO_REGISTRY_SECRET" >/dev/null 2>&1 || die "Missing voidburn-build/$KANIKO_REGISTRY_SECRET (config.json)"

  # Start from a clean slate so reruns do not collide with an earlier job or context.
  kubectl -n voidburn-build delete job kaniko-build-workload --ignore-not-found >/dev/null 2>&1 || true
  kubectl -n voidburn-build delete configmap vb-workload-buildctx --ignore-not-found >/dev/null 2>&1 || true

  # Build context is embedded (minimal strict checkpoint server)
  # NOTE(review): the embedded server uses the X-Voidburn-Window-Id header value
  # verbatim in the checkpoint file name and compares the bearer token with `!=`;
  # consider sanitising the id and using a constant-time comparison.
  cat > "$EVIDENCE_DIR/kaniko_server.py" <<'PY'
import os, json, hmac, hashlib
from datetime import datetime
from http.server import BaseHTTPRequestHandler, HTTPServer
PORT=int(os.getenv("PORT","8080"))
CHECKPOINT_DIR=os.getenv("CHECKPOINT_DIR","/checkpoints")
SECRET=os.getenv("WORKLOAD_CHECKPOINT_SECRET","").encode()
def sha(b): return hashlib.sha256(b).hexdigest()
def sig(b): return hmac.new(SECRET,b,hashlib.sha256).hexdigest()
class H(BaseHTTPRequestHandler):
  def send(self, code, obj):
    b=json.dumps(obj).encode()
    self.send_response(code); self.send_header("Content-Type","application/json"); self.send_header("Content-Length",str(len(b))); self.end_headers(); self.wfile.write(b)
  def do_POST(self):
    if self.path!="/checkpoint": return self.send(404,{"ok":False})
    if not SECRET: return self.send(500,{"ok":False,"error":"missing_secret"})
    auth=self.headers.get("Authorization","")
    if not auth.startswith("Bearer "): return self.send(401,{"ok":False,"error":"missing_bearer"})
    if auth.split(" ",1)[1].strip().encode()!=SECRET: return self.send(403,{"ok":False,"error":"bad_secret"})
    wid=self.headers.get("X-Voidburn-Window-Id") or datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")+"-manual"
    os.makedirs(CHECKPOINT_DIR,exist_ok=True)
    path=os.path.join(CHECKPOINT_DIR,f"checkpoint-{wid}.json")
    body=json.dumps({"windowId":wid,"timestamp":datetime.utcnow().isoformat()+"Z"}).encode()
    open(path,"wb").write(body)
    self.send(200,{"ok":True,"validated":True,"resumeValidated":True,"format":"voidburn-checkpoint/v1","windowId":wid,"timestamp":datetime.utcnow().isoformat()+"Z","checkpointPath":path,"sizeBytes":len(body),"sha256":sha(body),"sigAlgo":"hmac-sha256","sig":sig(body)})
HTTPServer(("",PORT),H).serve_forever()
PY
  cat > "$EVIDENCE_DIR/kaniko_Dockerfile" <<'DOCKER'
FROM python:3.11-slim
RUN useradd -u 10001 -m app && mkdir -p /checkpoints && chown -R app:app /checkpoints
WORKDIR /app
COPY server.py /app/server.py
USER app
EXPOSE 8080
CMD ["python","/app/server.py"]
DOCKER

  # Ship the two files to the build job as a ConfigMap-backed workspace.
  kubectl -n voidburn-build create configmap vb-workload-buildctx \
    --from-file=server.py="$EVIDENCE_DIR/kaniko_server.py" \
    --from-file=Dockerfile="$EVIDENCE_DIR/kaniko_Dockerfile" >/dev/null

  cat <<YAML | kubectl apply -f - >/dev/null
apiVersion: batch/v1
kind: Job
metadata:
  name: kaniko-build-workload
  namespace: voidburn-build
spec:
  backoffLimit: 0
  template:
    spec:
      restartPolicy: Never
      containers:
      - name: kaniko
        image: gcr.io/kaniko-project/executor:v1.23.2
        args:
        - --dockerfile=/workspace/Dockerfile
        - --destination=$KANIKO_PUSH_IMAGE
        - --context=dir:///workspace
        volumeMounts:
        - name: workspace
          mountPath: /workspace
        - name: dockerconfig
          mountPath: /kaniko/.docker
      volumes:
      - name: workspace
        configMap:
          name: vb-workload-buildctx
      - name: dockerconfig
        secret:
          secretName: $KANIKO_REGISTRY_SECRET
          items:
          - key: config.json
            path: config.json
YAML
  if ! kubectl -n voidburn-build wait --for=condition=complete job/kaniko-build-workload --timeout=10m >/dev/null; then
    # Keep the build output as evidence on failure too; previously the logs
    # were only captured after a successful build, which is when they matter least.
    kubectl -n voidburn-build logs job/kaniko-build-workload > "$EVIDENCE_DIR/kaniko_build.log" 2>&1 || true
    die "kaniko build failed"
  fi
  kubectl -n voidburn-build logs job/kaniko-build-workload > "$EVIDENCE_DIR/kaniko_build.log" || true

  # From here on the freshly pushed image is the workload image; gate it like a BYO image.
  WORKLOAD_IMAGE="$KANIKO_PUSH_IMAGE"
  verify_cosign "$WORKLOAD_IMAGE"
fi

# Apply the workload Deployment and its Service in one manifest. The heredoc is
# unquoted, so $WORKLOAD_SERVICE, $WORKLOAD_NAMESPACE, $WORKLOAD_IMAGE and
# $PULL_SECRET_YAML are expanded by the shell before kubectl sees the YAML.
# $PULL_SECRET_YAML is either empty (leaving a whitespace-only line) or a
# one-line imagePullSecrets entry. The container reads its checkpoint secret
# from the voidburn-checkpoint-secret Secret and mounts the checkpoint-pvc
# claim at /checkpoints.
cat <<YAML | kubectl apply -f - >/dev/null
apiVersion: apps/v1
kind: Deployment
metadata:
  name: $WORKLOAD_SERVICE
  namespace: $WORKLOAD_NAMESPACE
spec:
  replicas: 1
  selector:
    matchLabels: { app: $WORKLOAD_SERVICE }
  template:
    metadata:
      labels: { app: $WORKLOAD_SERVICE }
    spec:
      $PULL_SECRET_YAML
      containers:
      - name: workload
        image: $WORKLOAD_IMAGE
        imagePullPolicy: IfNotPresent
        ports: [{containerPort: 8080}]
        env:
        - name: PORT
          value: "8080"
        - name: CHECKPOINT_DIR
          value: "/checkpoints"
        - name: WORKLOAD_CHECKPOINT_SECRET
          valueFrom:
            secretKeyRef:
              name: voidburn-checkpoint-secret
              key: secret
        volumeMounts:
        - name: checkpoints
          mountPath: /checkpoints
      volumes:
      - name: checkpoints
        persistentVolumeClaim:
          claimName: checkpoint-pvc
---
apiVersion: v1
kind: Service
metadata:
  name: $WORKLOAD_SERVICE
  namespace: $WORKLOAD_NAMESPACE
spec:
  selector: { app: $WORKLOAD_SERVICE }
  ports:
  - name: http
    port: 8080
    targetPort: 8080
YAML

# Block (up to 3 minutes) until the workload rollout completes; abort otherwise.
kubectl -n "$WORKLOAD_NAMESPACE" rollout status deploy/"$WORKLOAD_SERVICE" --timeout=3m >/dev/null || die "workload not Ready"

# ---------- step 10: strict smoke ----------
# End-to-end checkpoint trigger test: a throw-away curl pod POSTs to the
# voidburn-checkpoint service's /voidburn/trigger endpoint using the shared
# checkpoint secret as bearer token, and the response is stored as evidence.
echo "[11] Strict checkpoint smoke"
# Decode the secret exactly as stored in the workload namespace.
CHECKPOINT_SECRET="$(kubectl -n "$WORKLOAD_NAMESPACE" get secret voidburn-checkpoint-secret -o jsonpath='{.data.secret}' | base64 -d)"
# curl -f makes HTTP errors fail the command; with pipefail that failure reaches `|| die`.
# NOTE: the bearer secret is interpolated into the curl arguments handed to `kubectl run`.
# NOTE(review): `kubectl run --rm -i` may append its own "pod ... deleted" message to
# stdout, which would end up in strict_smoke_response.json - confirm and filter if needed.
kubectl -n "$WORKLOAD_NAMESPACE" run vb-checkpoint-smoke --rm -i --restart=Never --image=curlimages/curl:8.6.0 -- \
  curl -fsS -X POST "http://voidburn-checkpoint.${WORKLOAD_NAMESPACE}.svc.cluster.local:8080/voidburn/trigger" \
    -H "Authorization: Bearer $CHECKPOINT_SECRET" \
    -H "X-Voidburn-Window-Id: manual-smoke" \
    -H "Content-Type: application/json" \
    --data "{\"reason\":\"manual_smoke\",\"windowSeconds\":$CHECKPOINT_WINDOW_SECONDS,\"windowId\":\"manual-smoke\"}" \
  | tee "$EVIDENCE_DIR/strict_smoke_response.json" >/dev/null \
  || die "strict smoke failed"

# The gate passes only when the voidburn-checkpoint ConfigMap carries a
# non-empty last_checkpoint value after the trigger.
LAST="$(kubectl -n "$WORKLOAD_NAMESPACE" get configmap voidburn-checkpoint -o jsonpath='{.data.last_checkpoint}' 2>/dev/null || true)"
[[ -n "$LAST" ]] || die "marker missing"
PASS_STRICT_SMOKE="$pass"

# ---------- step 11: sentinel readiness + dry-run proof ----------
echo "[12] Sentinel readiness + dry-run proof (non-destructive)"
# The Sentinel deployment must finish rolling out within 3 minutes.
if ! kubectl -n "$AGENT_NAMESPACE" rollout status deploy/sentinel-sentinel --timeout=3m >/dev/null; then
  die "sentinel not Ready"
fi
PASS_SENTINEL="$pass"

# ---------- step 11.5: IAM scoping simulation proof ----------
echo "[12.1] IAM scoping proof (simulation)"
# Read the IRSA role ARN from the Sentinel service account annotation
# "eks.amazonaws.com/role-arn". Inside single quotes each dot of the key needs
# exactly one backslash for kubectl's JSONPath; the previous doubled
# backslashes left the dots unescaped, so the lookup came back empty (errors
# are swallowed here) and the simulation below was always skipped.
SENTINEL_ROLE_ARN="$(kubectl -n "$AGENT_NAMESPACE" get sa sentinel-sentinel -o jsonpath='{.metadata.annotations.eks\.amazonaws\.com/role-arn}' 2>/dev/null || true)"
if [[ -n "$SENTINEL_ROLE_ARN" ]]; then
  echo "$SENTINEL_ROLE_ARN" > "$EVIDENCE_DIR/sentinel_role_arn.txt"
  ACCOUNT_ID="$($AWS sts get-caller-identity --query Account --output text 2>/dev/null || true)"
  if [[ -n "$ACCOUNT_ID" && "$ACCOUNT_ID" != "None" ]]; then
    TAGGED_ASG_SIM="$EVIDENCE_DIR/iam_sim_tagged_asg.json"
    UNTAGGED_ASG_SIM="$EVIDENCE_DIR/iam_sim_untagged_asg.json"
    TAGGED_SNAP_SIM="$EVIDENCE_DIR/iam_sim_tagged_snapshot.json"
    UNTAGGED_SNAP_SIM="$EVIDENCE_DIR/iam_sim_untagged_snapshot.json"

    # Tagged ASG: should ALLOW scale/terminate actions.
    if $AWS iam simulate-principal-policy \
      --policy-source-arn "$SENTINEL_ROLE_ARN" \
      --action-names autoscaling:UpdateAutoScalingGroup autoscaling:TerminateInstanceInAutoScalingGroup autoscaling:CompleteLifecycleAction \
      --resource-arns "*" \
      --context-entries \
        ContextKeyName=autoscaling:ResourceTag/voidburn.com/managed,ContextKeyValues=true,ContextKeyType=string \
        ContextKeyName=autoscaling:ResourceTag/voidburn.com/cluster,ContextKeyValues="$CLUSTER_NAME",ContextKeyType=string \
        ContextKeyName=autoscaling:ResourceTag/voidburn.com/target,ContextKeyValues=true,ContextKeyType=string \
        ContextKeyName="autoscaling:ResourceTag/kubernetes.io/cluster/$CLUSTER_NAME",ContextKeyValues=owned,ContextKeyType=string \
      --output json | tee "$TAGGED_ASG_SIM" >/dev/null; then
      :
    fi

    # Untagged ASG: should DENY the same actions.
    if $AWS iam simulate-principal-policy \
      --policy-source-arn "$SENTINEL_ROLE_ARN" \
      --action-names autoscaling:UpdateAutoScalingGroup autoscaling:TerminateInstanceInAutoScalingGroup autoscaling:CompleteLifecycleAction \
      --resource-arns "*" \
      --context-entries \
        ContextKeyName=autoscaling:ResourceTag/voidburn.com/managed,ContextKeyValues=false,ContextKeyType=string \
        ContextKeyName=autoscaling:ResourceTag/voidburn.com/cluster,ContextKeyValues="other-cluster",ContextKeyType=string \
        ContextKeyName=autoscaling:ResourceTag/voidburn.com/target,ContextKeyValues=false,ContextKeyType=string \
        ContextKeyName="autoscaling:ResourceTag/kubernetes.io/cluster/$CLUSTER_NAME",ContextKeyValues=owned,ContextKeyType=string \
      --output json | tee "$UNTAGGED_ASG_SIM" >/dev/null; then
      :
    fi

    # Tagged snapshot call: should ALLOW CreateSnapshot on tagged volumes.
    if $AWS iam simulate-principal-policy \
      --policy-source-arn "$SENTINEL_ROLE_ARN" \
      --action-names ec2:CreateSnapshot \
      --resource-arns "arn:aws:ec2:${AWS_REGION}:${ACCOUNT_ID}:volume/vol-0123456789abcdef0" \
      --context-entries \
        ContextKeyName=ec2:ResourceTag/voidburn.com/managed,ContextKeyValues=true,ContextKeyType=string \
        ContextKeyName=ec2:ResourceTag/voidburn.com/cluster,ContextKeyValues="$CLUSTER_NAME",ContextKeyType=string \
        ContextKeyName=ec2:ResourceTag/voidburn.com/target,ContextKeyValues=true,ContextKeyType=string \
      --output json | tee "$TAGGED_SNAP_SIM" >/dev/null; then
      :
    fi

    # Untagged snapshot call: should DENY.
    if $AWS iam simulate-principal-policy \
      --policy-source-arn "$SENTINEL_ROLE_ARN" \
      --action-names ec2:CreateSnapshot \
      --resource-arns "arn:aws:ec2:${AWS_REGION}:${ACCOUNT_ID}:volume/vol-0123456789abcdef0" \
      --context-entries \
        ContextKeyName=ec2:ResourceTag/voidburn.com/managed,ContextKeyValues=false,ContextKeyType=string \
        ContextKeyName=ec2:ResourceTag/voidburn.com/cluster,ContextKeyValues="other-cluster",ContextKeyType=string \
        ContextKeyName=ec2:ResourceTag/voidburn.com/target,ContextKeyValues=false,ContextKeyType=string \
      --output json | tee "$UNTAGGED_SNAP_SIM" >/dev/null; then
      :
    fi

    if [[ -s "$TAGGED_ASG_SIM" && -s "$UNTAGGED_ASG_SIM" && -s "$TAGGED_SNAP_SIM" && -s "$UNTAGGED_SNAP_SIM" ]] \
      && jq -e '[.EvaluationResults[].EvalDecision] | all(.=="allowed")' "$TAGGED_ASG_SIM" >/dev/null 2>&1 \
      && jq -e '[.EvaluationResults[].EvalDecision] | all(.!="allowed")' "$UNTAGGED_ASG_SIM" >/dev/null 2>&1 \
      && jq -e '[.EvaluationResults[].EvalDecision] | all(.=="allowed")' "$TAGGED_SNAP_SIM" >/dev/null 2>&1 \
      && jq -e '[.EvaluationResults[].EvalDecision] | all(.!="allowed")' "$UNTAGGED_SNAP_SIM" >/dev/null 2>&1; then
      PASS_IAM_SIM="$pass"
    else
      PASS_IAM_SIM="WARN"
      echo "WARN: IAM simulation evidence incomplete or unexpected decisions; see $EVIDENCE_DIR/iam_sim_*.json"
    fi
  else
    PASS_IAM_SIM="WARN"
    echo "WARN: could not resolve ACCOUNT_ID for IAM simulation"
  fi
else
  PASS_IAM_SIM="WARN"
  echo "WARN: could not resolve Sentinel role ARN from service account annotation"
fi

# Dry-run proof: no termination. We only capture evaluation readiness evidence.
# Snapshot the target ASG before the dry run; when ASG_NAME is empty an empty
# JSON object is written so the evidence file always exists.
if [[ -n "$ASG_NAME" ]]; then
  $AWS autoscaling describe-auto-scaling-groups --auto-scaling-group-names "$ASG_NAME" > "$EVIDENCE_DIR/asg_before.json"
else
  echo '{}' > "$EVIDENCE_DIR/asg_before.json"
fi

# Node inventory with the nodegroup/nodepool and voidburn labels as columns.
kubectl get nodes \
  -L eks.amazonaws.com/nodegroup \
  -L karpenter.sh/nodepool \
  -L voidburn.com/target \
  -L voidburn.com/protected \
  -o wide > "$EVIDENCE_DIR/nodes_labels.txt"
# Log tails are best-effort evidence; a missing deployment must not abort the run.
kubectl -n "$AGENT_NAMESPACE" logs deploy/sentinel-sentinel --tail=250 > "$EVIDENCE_DIR/sentinel_tail.log" || true
kubectl -n "$WORKLOAD_NAMESPACE" logs deploy/voidburn-checkpoint --tail=250 > "$EVIDENCE_DIR/receiver_tail.log" || true
kubectl -n "$WORKLOAD_NAMESPACE" logs deploy/"$WORKLOAD_SERVICE" --tail=250 > "$EVIDENCE_DIR/workload_tail.log" || true

# Optional “pending pod” to show scaler evaluation without scaling (harmless)
kubectl -n default delete pod voidburn-dryrun-pending --ignore-not-found >/dev/null 2>&1 || true
# The manifest carries no namespace, so pin the apply to "default": that is the
# namespace the pod is read from and deleted in below. Without it the pod lands
# in the kubeconfig context's namespace and may never be captured or cleaned up.
cat <<'YAML' | kubectl -n default apply -f - >/dev/null
apiVersion: v1
kind: Pod
metadata:
  name: voidburn-dryrun-pending
spec:
  restartPolicy: Never
  containers:
  - name: pause
    image: registry.k8s.io/pause:3.9
    resources:
      requests:
        cpu: "1000"
        memory: "1000Gi"
YAML
# Give the scaler a moment to evaluate the pending pod before collecting its logs.
sleep 10
if [[ "$SCALER_TYPE" == "cluster-autoscaler" ]]; then
  kubectl -n kube-system logs deploy/cluster-autoscaler --tail=200 > "$EVIDENCE_DIR/autoscaler_dryrun_tail.log" 2>/dev/null || true
else
  # Karpenter logs are only collected when its namespace and deployment name are known.
  if [[ -n "${KARPENTER_NS:-}" && -n "${KARPENTER_DEP:-}" ]]; then
    kubectl -n "$KARPENTER_NS" logs deploy/"$KARPENTER_DEP" --tail=200 > "$EVIDENCE_DIR/karpenter_dryrun_tail.log" 2>/dev/null || true
  fi
fi
# Record the pod's final state as evidence, then remove it.
kubectl -n default get pod voidburn-dryrun-pending -o yaml > "$EVIDENCE_DIR/dryrun_pending_pod.yaml" || true
kubectl -n default delete pod voidburn-dryrun-pending --ignore-not-found >/dev/null 2>&1 || true

# Snapshot the ASG again so before/after can be compared.
if [[ -n "$ASG_NAME" ]]; then
  $AWS autoscaling describe-auto-scaling-groups --auto-scaling-group-names "$ASG_NAME" > "$EVIDENCE_DIR/asg_after.json"
else
  echo '{}' > "$EVIDENCE_DIR/asg_after.json"
fi
PASS_DRYRUN="$pass"

# ---------- finalize evidence ----------
# Final best-effort resource listings; a failure here must not abort the run.
kubectl get pods,svc,pvc -n "$WORKLOAD_NAMESPACE" -o wide > "$EVIDENCE_DIR/workload_resources.txt" || true
kubectl get pods -n "$AGENT_NAMESPACE" -o wide > "$EVIDENCE_DIR/voidburn_pods.txt" || true

# write_summary is defined outside this section; afterwards the evidence
# directory is bundled into a gzip-compressed tarball next to it.
write_summary
tar -czf "${EVIDENCE_DIR}.tar.gz" "$EVIDENCE_DIR"

# Closing banner for the operator.
printf '\n'
printf '%s\n' "✅ COMPLETE"
printf '%s\n' "stack=$STACK_NAME"
printf '%s\n' "workloadImage=$WORKLOAD_IMAGE"
printf '%s\n' "evidence=${EVIDENCE_DIR}.tar.gz"
