#!/bin/bash
#
# Agent installation script.
#
# Steps, in order:
#   1. Free TCP ports 8002 and 8003.
#   2. Download agent.tar.gz into the home directory, unpack it and run
#      installAgent.sh from the unpacked "agent" directory.
#   3. Start the watchdog from /home/datavoice/agent/watchdog.
#   4. Wait until ports 8002 and 8003 are listening again.
#   5. Register a daily (02:00) cron job that re-runs the watchdog start.sh.
#
# Requires: wget, tar, crontab, and one of lsof/netstat/ss for port checks.

# Exit immediately on any unhandled error.
set -e

# Colors for logging.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color

readonly AGENT_URL='https://tools.mindspore.cn/tools/ci/agent/agent.tar.gz'
readonly WATCHDOG_DIR='/home/datavoice/agent/watchdog'

#######################################
# Logging helpers. $1 is the message; only log_error writes to stderr.
#######################################
log_error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2
}

log_info() {
  printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$1"
}

log_warning() {
  printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "$1"
}

log_success() {
  printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "$1"
}

#######################################
# Returns 0 if command $1 is available on PATH (or is a builtin/function).
#######################################
command_exists() {
  command -v "$1" >/dev/null 2>&1
}

#######################################
# Checks whether something is listening on a port.
# Arguments: $1 - port number
# Returns:   0 if a listener was found; non-zero if none was found or if no
#            inspection tool (lsof, netstat, ss) is installed.
#######################################
check_port() {
  local port=$1

  if command_exists lsof; then
    # Only listening TCP sockets count: a plain -i:PORT would also match
    # outbound client connections that merely talk to that port number.
    lsof -nP -iTCP:"$port" -sTCP:LISTEN >/dev/null 2>&1
  elif command_exists netstat; then
    netstat -tuln 2>/dev/null | grep -q ":${port} "
  elif command_exists ss; then
    ss -tuln 2>/dev/null | grep -q ":${port} "
  else
    log_warning "Cannot check port $port - no suitable command found"
    return 1
  fi
}

#######################################
# Stops whatever is listening on a port (SIGTERM first, SIGKILL if needed).
# Arguments: $1 - port number
# Returns:   0 when the port was idle or a stop was attempted (a port that is
#            still busy afterwards is only reported as a warning);
#            1 when neither lsof nor fuser is available to stop the process.
#######################################
stop_port_process() {
  local port=$1
  local -a pids=()

  if ! check_port "$port"; then
    log_info "Port $port is not in use"
    return 0
  fi

  log_info "Stopping process using port $port..."
  if command_exists lsof; then
    mapfile -t pids < <(lsof -nP -t -iTCP:"$port" -sTCP:LISTEN 2>/dev/null)
    if (( ${#pids[@]} > 0 )); then
      # Ask politely first. A failing kill must not abort the script under
      # 'set -e'; the verification below reports the outcome instead.
      kill "${pids[@]}" 2>/dev/null || true
      sleep 2
      if check_port "$port"; then
        kill -9 "${pids[@]}" 2>/dev/null || true
        sleep 2
      fi
    fi
  elif command_exists fuser; then
    # fuser exits non-zero when it finds nothing to kill; not fatal here.
    fuser -k "$port/tcp" || true
    sleep 2
  else
    log_warning "Cannot stop process on port $port - no suitable command found"
    return 1
  fi

  # Verify the port is free.
  if check_port "$port"; then
    log_warning "Port $port might still be in use"
  else
    log_success "Port $port is now free"
  fi
}

#######################################
# Polls until a port is listening or a timeout expires.
# Arguments: $1 - port number
#            $2 - maximum wait in seconds (default 60)
#            $3 - polling interval in seconds (default 5)
# Returns:   0 once the port is listening, 1 on timeout.
#######################################
wait_for_port() {
  local port=$1
  local max_wait=${2:-60}
  local wait_interval=${3:-5}
  local elapsed=0

  # A zero/negative interval would never advance 'elapsed'.
  if (( wait_interval <= 0 )); then
    wait_interval=1
  fi

  while (( elapsed < max_wait )); do
    if check_port "$port"; then
      log_success "Port $port started (waited ${elapsed} seconds)"
      return 0
    fi
    sleep "$wait_interval"
    elapsed=$(( elapsed + wait_interval ))
    log_info "Waiting for port $port... ${elapsed}/${max_wait} seconds"
  done

  # The port may have come up during the final sleep.
  if check_port "$port"; then
    log_success "Port $port started (waited ${elapsed} seconds)"
    return 0
  fi

  log_error "Port $port did not start within ${max_wait} seconds"
  return 1
}

#######################################
# Prints a lower-case distribution identifier ("unknown" if undetectable).
# Sources /etc/os-release or /etc/lsb-release, so call it inside a command
# substitution to keep those variables out of the main shell.
#######################################
detect_os() {
  local os_id=unknown

  if [[ -f /etc/os-release ]]; then
    # shellcheck disable=SC1091
    . /etc/os-release
    os_id=${ID:-unknown}
  elif [[ -f /etc/redhat-release ]]; then
    if grep -q "CentOS" /etc/redhat-release; then
      os_id=centos
    elif grep -q "Red Hat" /etc/redhat-release; then
      os_id=rhel
    fi
  elif [[ -f /etc/lsb-release ]]; then
    # shellcheck disable=SC1091
    . /etc/lsb-release
    os_id=${DISTRIB_ID:-unknown}
  fi

  # Normalize case so mixed-case IDs (e.g. "openEuler") match the case
  # statement below.
  printf '%s\n' "$os_id" | tr '[:upper:]' '[:lower:]'
}

# ---------------------------------------------------------------------------
# Main script starts here
# ---------------------------------------------------------------------------
log_info "=== Start installation script: $(date) ==="

# Validate required commands.
for cmd in wget tar; do
  if ! command_exists "$cmd"; then
    log_error "Required command '$cmd' not found. Please install it first."
    exit 1
  fi
done

# Stop processes that may be using ports 8002 and 8003.
log_info "Stopping processes that may be using ports 8002 and 8003..."
for port in 8002 8003; do
  # Being unable to stop a process is treated like a port that stays busy:
  # warn and carry on rather than aborting with only a warning printed.
  if ! stop_port_process "$port"; then
    log_warning "Could not free port $port; continuing anyway"
  fi
done

# Download and install agent.
log_info "Starting agent installation..."
if ! cd ~; then
  log_error "Cannot change to home directory"
  exit 1
fi

if [[ -f "agent.tar.gz" ]]; then
  log_info "Backing up existing agent.tar.gz"
  mv agent.tar.gz agent.tar.gz.old
fi

# NOTE(security): --no-check-certificate disables TLS verification for an
# archive whose contents are executed below. Kept for compatibility with the
# existing behavior; remove it once the server certificate can be validated.
if ! wget -q --no-check-certificate -O agent.tar.gz "$AGENT_URL"; then
  # -O leaves an empty/partial file behind on failure; do not keep it.
  rm -f agent.tar.gz
  log_error "Failed to download agent.tar.gz"
  exit 1
fi

log_info "Extracting agent package..."
rm -rf agent
if ! tar -zxf agent.tar.gz; then
  log_error "Failed to extract agent.tar.gz"
  exit 1
fi

if ! cd agent; then
  log_error "agent directory not found after extraction"
  exit 1
fi

if [[ ! -f "installAgent.sh" ]]; then
  log_error "installAgent.sh not found in agent directory"
  exit 1
fi

log_info "Running installAgent.sh..."
if ! bash installAgent.sh; then
  log_error "installAgent.sh failed"
  exit 1
fi

# Start watchdog.
if ! cd "$WATCHDOG_DIR"; then
  log_error "Cannot change to watchdog directory $WATCHDOG_DIR"
  exit 1
fi

if [[ ! -f "start.sh" ]]; then
  log_error "start.sh not found in watchdog directory"
  exit 1
fi

log_info "Starting watchdog service..."
if ! bash start.sh; then
  log_error "Watchdog start.sh failed"
  exit 1
fi

# Wait for services to start.
log_info "Waiting for services to fully start..."
max_wait=60
wait_interval=5

if wait_for_port 8002 "$max_wait" "$wait_interval" \
  && wait_for_port 8003 "$max_wait" "$wait_interval"; then
  log_success "All service ports started successfully"
else
  log_error "One or more services failed to start"
  exit 1
fi

# Set up scheduled task.
log_info "Setting up scheduled task..."

OS_TYPE=$(detect_os)
log_info "Detected system type: $OS_TYPE"

# Create logs directory if it doesn't exist.
mkdir -p "$WATCHDOG_DIR/logs"

# Pick the cron service name based on system type.
case "$OS_TYPE" in
  ubuntu|debian)
    log_info "Ubuntu/Debian system, using cron service"
    SERVICE_NAME="cron"
    ;;
  centos|rhel|euler|euleros|openeuler|fedora)
    log_info "CentOS/RHEL/Euler/openEuler/Fedora system, using crond service"
    SERVICE_NAME="crond"
    ;;
  *)
    log_warning "Unknown system type, using basic setup"
    SERVICE_NAME=""
    ;;
esac

CRON_JOB="0 2 * * * bash $WATCHDOG_DIR/start.sh >> $WATCHDOG_DIR/logs/start.log 2>&1"

# Rebuild the crontab: drop any previous watchdog start.sh entry, keep every
# other line (including the user's comments and unrelated jobs), then append
# the new job.
temp_file=$(mktemp)
trap 'rm -f -- "$temp_file"' EXIT

# 'crontab -l' fails when the user has no crontab yet; awk then simply sees
# empty input, so the pipeline still succeeds.
crontab -l 2>/dev/null \
  | awk -v dir="$WATCHDOG_DIR" \
      '!(index($0, dir) && index($0, "start.sh"))' > "$temp_file"
printf '%s\n' "$CRON_JOB" >> "$temp_file"

if ! crontab "$temp_file"; then
  log_error "Failed to install crontab"
  exit 1
fi
rm -f -- "$temp_file"
trap - EXIT

# Reload the cron service if possible. This is best-effort: some units
# (NOTE(review): e.g. Debian/Ubuntu cron.service, confirm) define no reload
# action, and a failed reload must not abort the script after the crontab
# has already been installed.
if [[ -n "$SERVICE_NAME" ]] \
  && systemctl is-active --quiet "$SERVICE_NAME" 2>/dev/null; then
  if systemctl reload "$SERVICE_NAME" 2>/dev/null; then
    log_success "Reloaded $SERVICE_NAME service"
  else
    log_warning "Could not reload $SERVICE_NAME service (not fatal)"
  fi
fi

# Verify scheduled task.
log_info "Verifying scheduled task:"
if crontab -l 2>/dev/null | grep -qF -- "$CRON_JOB"; then
  log_success "Scheduled task added successfully"
  log_info "Current crontab:"
  crontab -l
else
  log_error "Failed to add scheduled task"
  exit 1
fi

log_success "=== Script execution completed: $(date) ==="