#!/bin/bash
# npu_auto_occupy.sh - start the NPU occupy script while the NPUs are idle and
# stop it as soon as any other process shows up in `npu-smi info`.
#
# Every CHECK_INTERVAL seconds the script:
#   1. collects the PIDs of the occupy script (pgrep -f OCCUPY_PATTERN),
#   2. collects the PIDs listed in the process table of `npu-smi info`,
#   3. counts the NPU PIDs that do not belong to the occupy script,
#   4. runs START_SCRIPT when that count is 0 and occupy is not running, or
#      STOP_SCRIPT when the count is > 0 and occupy is running.
# All decisions are appended to LOG_FILE (and echoed to stdout).

LOG_FILE="/home/jenkins/.npu-scripts/npu_auto_occupy.log"
START_SCRIPT="/home/jenkins/.npu-scripts/start_occupy.sh"
STOP_SCRIPT="/home/jenkins/.npu-scripts/stop_occupy.sh"
OCCUPY_PATTERN="occupy_npu_8cards.py"
CHECK_INTERVAL=300 # 5 minutes (change back after debugging)

#######################################
# Write a timestamped message to stdout and append it to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
#######################################
log() {
  printf '[%s] %s\n' "$(date)" "$1" | tee -a "$LOG_FILE"
}

#######################################
# Print the PIDs found in the process table of `npu-smi info`, one per line.
# Everything from the first line containing "Process" to the end of the
# output is scanned (no fixed line limit). The third '|'-separated column is
# stripped of padding and kept only when it is purely numeric, so headers,
# separators and "no running processes" rows are dropped.
# Outputs:   unique PIDs on stdout (possibly nothing)
# Returns:   1 if `npu-smi info` itself fails, 0 otherwise
#######################################
list_npu_pids() {
  local smi_output
  smi_output=$(npu-smi info 2>/dev/null) || return 1
  awk -F '|' '
    /Process/ { in_table = 1 }
    in_table {
      pid = $3
      gsub(/[ \t]/, "", pid)   # column values are space-padded
      if (pid ~ /^[0-9]+$/) print pid
    }
  ' <<<"$smi_output" | sort -u
  return 0
}

#######################################
# Count the NPU PIDs that are not occupy-script PIDs.
# Arguments: $1 - newline-separated PIDs using the NPUs (may be empty)
#            $2 - newline-separated PIDs of the occupy script (may be empty)
# Outputs:   the count on stdout
#######################################
count_other_pids() {
  local npu_pids=$1
  local occupy_pids=$2

  if [[ -z "$npu_pids" ]]; then
    echo 0
  elif [[ -z "$occupy_pids" ]]; then
    # Nothing to exclude; an empty pattern list must not be handed to grep.
    grep -c . <<<"$npu_pids" || true
  else
    # -F -x: exclude only exact, literal PID matches (123 must not hide 1234).
    grep -v -x -F -f <(printf '%s\n' "$occupy_pids") <<<"$npu_pids" \
      | grep -c . || true
  fi
}

while true; do
  # Snapshot every PID of the occupy script once per cycle; the script may
  # consist of several processes and all of them must be excluded.
  occupy_pids=$(pgrep -f "$OCCUPY_PATTERN")

  if ! npu_pids=$(list_npu_pids); then
    # Unknown NPU state must not be mistaken for "idle".
    log "WARN: npu-smi info failed -> skipping this check"
    sleep "$CHECK_INTERVAL"
    continue
  fi

  other_count=$(count_other_pids "$npu_pids" "$occupy_pids")

  log "DEBUG: other_count=$other_count, occupy_pid=${occupy_pids//$'\n'/ }"

  if (( other_count == 0 )); then
    # No other processes -> idle
    if [[ -z "$occupy_pids" ]]; then
      log "Idle -> starting occupy"
      bash "$START_SCRIPT"
    else
      log "Idle -> occupy already running (keep)"
    fi
  else
    # Other processes exist -> busy
    if [[ -n "$occupy_pids" ]]; then
      log "Busy ($other_count other processes) -> stopping occupy"
      bash "$STOP_SCRIPT"
    else
      log "Busy ($other_count other processes) -> no occupy running (OK)"
    fi
  fi

  sleep "$CHECK_INTERVAL"
done