#!/usr/bin/env bash
# monitor_pod_or_cronjob.sh
# Two modes:
#   A) CronJob mode (give -c/--cronjob): create a Job from a CronJob and follow newest pod.
#   B) Pod-pattern mode (give one of --pod-exact/--pod-prefix/--pod-regex): wait for a pod
#      that matches and follow THAT pod precisely.
#
# Output: <script_dir>/output/<YYYYmmdd-HHMMSS>_<name>/{pod.log,resources.csv}
# Requirements: kubectl (metrics-server optional for CPU/mem numbers).

set -euo pipefail

# --------------------------
# Defaults (override by flags)
# --------------------------
NAMESPACE=""
CRONJOB_NAME=""        # if set => CronJob mode
JOB_PREFIX=""          # default => CRONJOB_NAME
POD_EXACT=""           # if set => Pod-pattern mode (exact match)
POD_PREFIX=""          # if set => Pod-pattern mode (prefix match)
POD_REGEX=""           # if set => Pod-pattern mode (regex via grep -E)
# shellcheck disable=SC2034  # kept for compatibility; nothing in this section reads it
LOCK_ONCE="true"       # if true, never switch to another pod after locking; set --follow-newer=false to change
CONTAINER_NAME=""      # specific container to stream logs from
SAMPLE_INTERVAL=5      # seconds between resource samples
WAIT_TIMEOUT=900       # seconds to wait for pod to appear
KUBECTL="${KUBECTL:-kubectl}"
TUE_SAT_ONLY="false"

# --------------------------
# Internals
# --------------------------
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
OUTPUT_ROOT="${SCRIPT_DIR}/output"
TIMESTAMP="$(date +'%Y%m%d-%H%M%S')"

# Local wall-clock timestamp used in log messages.
ts() { date +'%Y-%m-%d %H:%M:%S'; }
# UTC ISO-8601 timestamp used for CSV rows.
tsz() { date -u +'%Y-%m-%dT%H:%M:%SZ'; }

# Print the help text and exit.
# Arguments: $1 - exit status (optional, default 1 so misuse is reported as failure)
usage() {
  cat <<EOF
Usage:
  # CronJob mode (create Job and monitor)
  ${0##*/} -n <ns> -c <cronjob> [--job-prefix PFX] [--container NAME] [--interval SECS] [--wait SECS] [--follow-newer=true|false]

  # Pod-pattern mode (do NOT create anything; monitor a pod by name pattern)
  ${0##*/} -n <ns> [--pod-exact NAME | --pod-prefix PREFIX | --pod-regex REGEX] [--container NAME] [--interval SECS] [--wait SECS] [--follow-newer=false]

Options:
  -n, --namespace NS      Namespace (required in both modes)
  -c, --cronjob NAME      CronJob name (CronJob mode)
  -j, --job-prefix PFX    Prefix for created Job (default: cronjob name)
  --pod-exact NAME        Exact pod name (Pod-pattern mode)
  --pod-prefix PREFIX     Pod name starts with PREFIX (Pod-pattern mode)
  --pod-regex REGEX       Pod name matches REGEX (Pod-pattern mode; grep -E)
  --container NAME        Container to stream logs (default: first / all)
  --interval SECS         Metrics sample interval (default: 5)
  --wait SECS             Timeout waiting for pod (default: 900)
  --follow-newer=BOOL     If true (default in CronJob mode), switch to newer pods when they appear.
                          In Pod-pattern mode default is true for prefix/regex, false for exact.
  --tue-sat-only          Skip on Sun/Mon
  -h, --help

Examples:
  # Monitor a specific pod by exact name:
  ${0##*/} -n data --pod-exact importer-20251012-81kdb --interval 2

  # Monitor the newest pod starting with a prefix:
  ${0##*/} -n data --pod-prefix importer- --interval 5 --follow-newer=false

  # Monitor by regex:
  ${0##*/} -n data --pod-regex '^importer-[0-9]{8}-[0-9]{6}-[a-z0-9]{5}\$'

  # Old behavior: spawn from CronJob and monitor newest pod:
  ${0##*/} -n data -c daily-import --interval 5
EOF
  exit "${1:-1}"
}

# Abort with a readable message when an option that takes a value is the last
# argument. Without this, "$2" trips `set -u` with a bare "unbound variable".
# Arguments: the remaining command line, option first ("$@").
require_value() {
  if (( $# < 2 )); then
    echo "[ERROR $(ts)] Option '$1' requires a value." >&2
    exit 2
  fi
}

# --------------------------
# Parse args
# --------------------------
FOLLOW_NEWER="__unset__"   # sentinel: the default is chosen per mode below
while [[ $# -gt 0 ]]; do
  case "$1" in
    -n|--namespace)   require_value "$@"; NAMESPACE="$2"; shift 2 ;;
    -c|--cronjob)     require_value "$@"; CRONJOB_NAME="$2"; shift 2 ;;
    -j|--job-prefix)  require_value "$@"; JOB_PREFIX="$2"; shift 2 ;;
    --pod-exact)      require_value "$@"; POD_EXACT="$2"; shift 2 ;;
    --pod-prefix)     require_value "$@"; POD_PREFIX="$2"; shift 2 ;;
    --pod-regex)      require_value "$@"; POD_REGEX="$2"; shift 2 ;;
    --container)      require_value "$@"; CONTAINER_NAME="$2"; shift 2 ;;
    --interval)       require_value "$@"; SAMPLE_INTERVAL="$2"; shift 2 ;;
    --wait)           require_value "$@"; WAIT_TIMEOUT="$2"; shift 2 ;;
    --follow-newer=*) FOLLOW_NEWER="${1#*=}"; shift 1 ;;
    --tue-sat-only)   TUE_SAT_ONLY="true"; shift ;;
    -h|--help)        usage 0 ;;
    *)                echo "Unknown arg: $1" >&2; usage ;;
  esac
done

# --------------------------
# Validations & mode detect
# --------------------------
if [[ -z "$NAMESPACE" ]]; then
  echo "[ERROR $(ts)] --namespace is required."
  exit 2
fi

# WAIT_TIMEOUT is used in shell arithmetic below, so insist on plain digits.
if [[ ! "$WAIT_TIMEOUT" =~ ^[0-9]+$ ]]; then
  echo "[ERROR $(ts)] --wait must be a non-negative integer (got '$WAIT_TIMEOUT')." >&2
  exit 2
fi

MODE="unknown"
if [[ -n "$CRONJOB_NAME" ]]; then
  MODE="cronjob"
elif [[ -n "$POD_EXACT" || -n "$POD_PREFIX" || -n "$POD_REGEX" ]]; then
  MODE="podpattern"
else
  echo "[ERROR $(ts)] Specify either -c/--cronjob OR one of --pod-exact/--pod-prefix/--pod-regex."
  exit 2
fi

# Reject an unusable regex up front: grep exits with a status above 1 on a
# pattern error, whereas 1 only means "no match" (expected on empty input).
if [[ -n "$POD_REGEX" ]]; then
  regex_rc=0
  grep -Eq -- "$POD_REGEX" </dev/null 2>/dev/null || regex_rc=$?
  if (( regex_rc > 1 )); then
    echo "[ERROR $(ts)] --pod-regex is not a valid extended regular expression: $POD_REGEX" >&2
    exit 2
  fi
fi

# FOLLOW_NEWER default per mode
if [[ "$FOLLOW_NEWER" == "__unset__" ]]; then
  if [[ "$MODE" == "cronjob" ]]; then
    FOLLOW_NEWER="true"
  elif [[ -n "$POD_EXACT" ]]; then
    # in pod-pattern mode: exact => false; prefix/regex => true
    FOLLOW_NEWER="false"
  else
    FOLLOW_NEWER="true"
  fi
fi

# day gating optional (date +%u: 1 = Monday ... 7 = Sunday)
if [[ "$TUE_SAT_ONLY" == "true" ]]; then
  dow=$(date +%u)
  if [[ "$dow" -eq 7 || "$dow" -eq 1 ]]; then
    echo "[INFO $(ts)] Skipping (Sun/Mon) due to --tue-sat-only."
    exit 0
  fi
fi

# cluster checks
"$KUBECTL" version --request-timeout=5s >/dev/null 2>&1 \
  || { echo "[ERROR $(ts)] kubectl cannot reach cluster."; exit 3; }
"$KUBECTL" get ns "$NAMESPACE" >/dev/null 2>&1 \
  || { echo "[ERROR $(ts)] Namespace '$NAMESPACE' not found or no access."; exit 4; }

# --------------------------
# Utility functions
# --------------------------

# Convert a memory quantity with a binary (Ki/Mi/Gi/Ti) or decimal (k/K/M/G/T)
# suffix into bytes. Values without a recognised suffix are echoed unchanged.
# Arguments: $1 - quantity string, e.g. "128Mi"
# Outputs:   the converted value on stdout
mem_to_bytes() {
  local v="$1"
  case "$v" in
    *Ki)    echo $(( ${v%Ki} * 1024 )) ;;
    *Mi)    echo $(( ${v%Mi} * 1024 * 1024 )) ;;
    *Gi)    echo $(( ${v%Gi} * 1024 * 1024 * 1024 )) ;;
    *Ti)    echo $(( ${v%Ti} * 1024 * 1024 * 1024 * 1024 )) ;;
    *k|*K)  echo $(( ${v%[kK]} * 1000 )) ;;
    *M)     echo $(( ${v%M} * 1000 * 1000 )) ;;
    *G)     echo $(( ${v%G} * 1000 * 1000 * 1000 )) ;;
    *T)     echo $(( ${v%T} * 1000 * 1000 * 1000 * 1000 )) ;;
    *)      echo "$v" ;;
  esac
}

# Print the name of the most recently created pod labelled job-name=$JOB_NAME.
# Prints nothing when there is none. Always returns 0: a failing kubectl call
# must not abort the caller under `set -e` / `pipefail`, it just means "no pod yet".
get_newest_pod_by_job() {
  {
    "$KUBECTL" -n "$NAMESPACE" get pods -l "job-name=${JOB_NAME}" \
      --sort-by=.metadata.creationTimestamp \
      -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true
  } | tail -n 1
}

# Print the newest pod in $NAMESPACE whose name matches the active pattern
# (POD_EXACT, else POD_PREFIX, else POD_REGEX). Prints nothing when no pod
# matches. Always returns 0 so that "no match yet" keeps the wait loop polling
# instead of terminating the script through `set -e` / `pipefail`.
get_pod_match() {
  local list
  list="$("$KUBECTL" -n "$NAMESPACE" get pods \
    --sort-by=.metadata.creationTimestamp \
    -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"
  [[ -n "$list" ]] || return 0

  if [[ -n "$POD_EXACT" ]]; then
    # -F -x: literal whole-line comparison (no regex, no numeric coercion).
    { grep -Fx -- "$POD_EXACT" <<<"$list" || true; } | tail -n 1
  elif [[ -n "$POD_PREFIX" ]]; then
    awk -v p="$POD_PREFIX" 'index($0,p)==1' <<<"$list" | tail -n 1
  else
    { grep -E -- "$POD_REGEX" <<<"$list" || true; } | tail -n 1
  fi
}

# Print the waiting reason of the first container status of pod $1
# (empty when that container is not waiting or the lookup fails).
pod_waiting_reason() {
  "$KUBECTL" -n "$NAMESPACE" get pod "$1" \
    -o jsonpath='{.status.containerStatuses[0].state.waiting.reason}' 2>/dev/null || true
}

# --------------------------
# Output folder & filenames
# --------------------------
NAME_FOR_FOLDER=""
if [[ "$MODE" == "cronjob" ]]; then
  NAME_FOR_FOLDER="${CRONJOB_NAME}"
else
  # The regex is sanitised because it may contain characters unsuitable for a path.
  NAME_FOR_FOLDER="${POD_EXACT:-${POD_PREFIX:-${POD_REGEX//[^A-Za-z0-9_-]/_}}}"
fi
RUN_DIR="${OUTPUT_ROOT}/${TIMESTAMP}_${NAME_FOR_FOLDER}"
mkdir -p "$RUN_DIR"
POD_LOG="${RUN_DIR}/pod.log"
RES_CSV="${RUN_DIR}/resources.csv"
echo "[INFO $(ts)] Output folder: $RUN_DIR"
echo "ts_utc,container,cpu_m,mem_bytes,node_cpu_pct,node_mem_bytes" > "$RES_CSV"

# --------------------------
# CronJob mode: create Job
# --------------------------
POD_NAME=""
NODE_NAME=""
JOB_NAME=""
if [[ "$MODE" == "cronjob" ]]; then
  "$KUBECTL" -n "$NAMESPACE" get cronjob "$CRONJOB_NAME" >/dev/null 2>&1 \
    || { echo "[ERROR $(ts)] CronJob '$CRONJOB_NAME' not found."; exit 5; }
  JOB_PREFIX="${JOB_PREFIX:-$CRONJOB_NAME}"
  JOB_NAME="${JOB_PREFIX}-${TIMESTAMP}"
  echo "[INFO $(ts)] Creating Job '$JOB_NAME' from CronJob '$CRONJOB_NAME' in ns '$NAMESPACE'..." | tee -a "$POD_LOG"
  "$KUBECTL" -n "$NAMESPACE" create job --from=cronjob/"$CRONJOB_NAME" "$JOB_NAME" >/dev/null \
    || { echo "[ERROR $(ts)] Failed to create Job."; exit 6; }
fi

# --------------------------
# Wait for the target pod
# --------------------------
echo "[INFO $(ts)] Waiting for target pod..." | tee -a "$POD_LOG"
# 10# forces base 10 so a value with leading zeros is not parsed as octal.
DEADLINE=$(( $(date +%s) + 10#$WAIT_TIMEOUT ))
while [[ -z "$POD_NAME" && $(date +%s) -lt $DEADLINE ]]; do
  if [[ "$MODE" == "cronjob" ]]; then
    POD_NAME="$(get_newest_pod_by_job)"
  else
    POD_NAME="$(get_pod_match)"
  fi
  if [[ -z "$POD_NAME" ]]; then
    sleep 1
  fi
done
if [[ -z "$POD_NAME" ]]; then
  echo "[ERROR $(ts)] Timed out waiting for pod." | tee -a "$POD_LOG"
  exit 7
fi

NODE_NAME="$("$KUBECTL" -n "$NAMESPACE" get pod "$POD_NAME" -o jsonpath='{.spec.nodeName}' 2>/dev/null || true)"
echo "[INFO $(ts)] Monitoring pod: $POD_NAME | Node: ${NODE_NAME:-unknown}" | tee -a "$POD_LOG"

# pick container hint
if [[ -z "$CONTAINER_NAME" ]]; then
  CONTAINER_NAME="$("$KUBECTL" -n "$NAMESPACE" get pod "$POD_NAME" -o jsonpath='{.spec.containers[0].name}' 2>/dev/null || true)"
fi
echo "[INFO $(ts)] Container (hint): ${CONTAINER_NAME:-<all>}" | tee -a "$POD_LOG"

# Metrics probe.
# `top pod` can fail for a pod that has no samples yet (e.g. one that was just
# created), so also accept a successful discovery call against the metrics API
# group before declaring metrics unavailable for the whole run.
METRICS_AVAILABLE=true
if ! "$KUBECTL" -n "$NAMESPACE" top pod "$POD_NAME" >/dev/null 2>&1 \
  && ! "$KUBECTL" get --raw /apis/metrics.k8s.io/v1beta1 >/dev/null 2>&1; then
  echo "[WARN $(ts)] Metrics API not available; resources.csv will have blanks." | tee -a "$POD_LOG"
  METRICS_AVAILABLE=false
fi

# --------------------------
# Start log streaming
# --------------------------
# Runs in a background subshell; its PID is kept in LOG_PID so it can be stopped later.
{
  echo "[INFO $(ts)] --- BEGIN POD LOG STREAM ---"

  # `kubectl logs -f` returns an error while the container is still being
  # created, so hold off until the pod has left the Pending phase (or is gone).
  while [[ "$("$KUBECTL" -n "$NAMESPACE" get pod "$POD_NAME" -o jsonpath='{.status.phase}' 2>/dev/null || true)" == "Pending" ]]; do
    sleep 1
  done

  log_args=(-n "$NAMESPACE" logs -f "pod/${POD_NAME}" --timestamps)
  if [[ -n "$CONTAINER_NAME" ]]; then
    log_args+=(-c "$CONTAINER_NAME")
  else
    log_args+=(--all-containers)
  fi
  # `|| true`: a failing follower must still reach the END marker below.
  "$KUBECTL" "${log_args[@]}" || true

  echo "[INFO $(ts)] --- END POD LOG STREAM ---"
} >> "$POD_LOG" 2>&1 &
LOG_PID=$!
# --------------------------
# Sampling loop
# --------------------------
# NOTE: sample_loop is started with `&` below and therefore runs in a subshell.
# Changing this flag in the parent afterwards is not seen by that subshell; the
# variable is kept only because other parts of the script assign to it.
stop_sampling="false"

# Append one batch of resource samples to $RES_CSV every $SAMPLE_INTERVAL
# seconds, until the process is terminated.
# Globals read:    FOLLOW_NEWER MODE POD_EXACT METRICS_AVAILABLE KUBECTL NAMESPACE
#                  RES_CSV POD_LOG SAMPLE_INTERVAL stop_sampling
# Globals written: POD_NAME, NODE_NAME (inside the background subshell only, so
#                  a pod switch here affects the sampled metrics and nothing else)
# Row format:      ts_utc,container,cpu_m,mem_bytes,node_cpu_pct,node_mem_bytes
sample_loop() {
  local latest sample_ts node_cpu node_mem_b cpu_pct mem_raw
  local ctnr cpu_raw cpu_m mem_b

  while [[ "$stop_sampling" != "true" ]]; do
    # optionally follow newer pod if requested (for cronjob / prefix / regex cases)
    if [[ "$FOLLOW_NEWER" == "true" ]]; then
      latest=""
      # `|| true`: a lookup that finds nothing (or fails) must not terminate
      # the sampler through `set -e`; it simply means "keep the current pod".
      if [[ "$MODE" == "cronjob" ]]; then
        latest="$(get_newest_pod_by_job || true)"
      elif [[ -n "$POD_EXACT" ]]; then
        # exact => stay; prefix/regex => can move
        latest="$POD_NAME"
      else
        latest="$(get_pod_match || true)"
      fi

      if [[ -n "$latest" && "$latest" != "$POD_NAME" ]]; then
        POD_NAME="$latest"
        NODE_NAME="$("$KUBECTL" -n "$NAMESPACE" get pod "$POD_NAME" -o jsonpath='{.spec.nodeName}' 2>/dev/null || true)"
        echo "[INFO $(ts)] Switched to pod: $POD_NAME | Node: ${NODE_NAME:-unknown}" >> "$POD_LOG"
      fi
    fi

    sample_ts="$(tsz)"
    node_cpu=""
    node_mem_b=""

    # Node-level figures: row 2 of `kubectl top node` is the data row; its
    # third column is the CPU percentage and its fourth the memory quantity.
    if [[ "$METRICS_AVAILABLE" == true && -n "$NODE_NAME" ]]; then
      if read -r _ _ cpu_pct mem_raw _ < <("$KUBECTL" top node "$NODE_NAME" 2>/dev/null | awk 'NR==2{print $1,$2,$3,$4,$5}'); then
        node_cpu="${cpu_pct%%%}"   # drop the trailing percent sign
        node_mem_b="$(mem_to_bytes "$mem_raw")"
      fi
    fi

    if [[ "$METRICS_AVAILABLE" == true && -n "$POD_NAME" ]]; then
      # One CSV row per container reported by `kubectl top pod --containers`.
      while read -r _ ctnr cpu_raw mem_raw _; do
        [[ -z "$ctnr" ]] && continue
        if [[ "$cpu_raw" == *m ]]; then
          cpu_m="${cpu_raw%m}"   # already in millicores
        else
          # whole cores => millicores
          cpu_m=$(awk -v v="$cpu_raw" 'BEGIN{printf "%.0f", v*1000}')
        fi
        mem_b="$(mem_to_bytes "$mem_raw")"
        echo "$sample_ts,$ctnr,$cpu_m,$mem_b,${node_cpu},${node_mem_b}" >> "$RES_CSV"
      done < <("$KUBECTL" -n "$NAMESPACE" top pod "$POD_NAME" --containers 2>/dev/null | awk 'NR>1{print $1,$2,$3,$4,$5}')
    else
      # Metrics unavailable: keep the time series going with an empty row.
      echo "$sample_ts,,,,," >> "$RES_CSV"
    fi

    sleep "$SAMPLE_INTERVAL"
  done
}

sample_loop &
SAMP_PID=$!
# --------------------------
# Watch for end
# --------------------------
# Poll the phase of the pod selected at startup every 3 seconds. The loop ends
# when the pod has terminated, can no longer be fetched, or reports a waiting
# reason from which it is not expected to recover.
PHASE=""
while true; do
  TARGET="$POD_NAME"
  # A failed lookup is reported as "NotFound" and ends the watch.
  PHASE="$("$KUBECTL" -n "$NAMESPACE" get pod "$TARGET" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")"
  if [[ "$PHASE" == "Succeeded" || "$PHASE" == "Failed" || "$PHASE" == "NotFound" ]]; then
    break
  fi

  # Early exit on hard waiting reasons
  REASON="$(pod_waiting_reason "$TARGET" || true)"
  case "$REASON" in
    ImagePullBackOff|ErrImagePull|CreateContainerConfigError|CrashLoopBackOff|CreateContainerError)
      echo "[ERROR $(ts)] Pod waiting reason: $REASON" | tee -a "$POD_LOG"
      break
      ;;
  esac

  sleep 3
done

# --------------------------
# Stop background tasks
# --------------------------
stop_sampling="true"
kill "$SAMP_PID" >/dev/null 2>&1 || true

# When the pod has terminated, let the log follower finish on its own for a
# few seconds before stopping it, so the last lines still reach pod.log.
if [[ "$PHASE" == "Succeeded" || "$PHASE" == "Failed" ]]; then
  drain_left=10
  while (( drain_left > 0 )) && kill -0 "$LOG_PID" 2>/dev/null; do
    sleep 1
    drain_left=$(( drain_left - 1 ))
  done
fi

# Stop the follower's children first (the kubectl process started inside the
# background job); signalling only the parent PID would leave them running.
if command -v pkill >/dev/null 2>&1; then
  pkill -P "$LOG_PID" >/dev/null 2>&1 || true
fi
kill "$LOG_PID" >/dev/null 2>&1 || true
wait "$SAMP_PID" "$LOG_PID" 2>/dev/null || true

# --------------------------
# Final messages
# --------------------------
echo "[INFO $(ts)] Pod final phase: ${PHASE:-unknown}" | tee -a "$POD_LOG"
echo "[INFO $(ts)] Output saved to: $RUN_DIR" | tee -a "$POD_LOG"
# End of script. (The trailing web-page text "Top comments (0)" is not part of the script.)