title: 前置检查脚本
author: Gamehu
date: 2024-08-18 22:19:13
tags:
---
离职系列 第八篇
离职系列,想想这几年在公司的成长,在这做个记录。
此篇是因为遇到了太多环境类问题,从LMT建立的数据统计,这类问题占比已经超过了我处理问题的60%左右,占所有现场问题的20%左右以上,所以抽了个时间把可以在前置检查中避免的问题都梳理出来,试着写了个脚本,并一次次到现场验证,最终有了以下版本。该脚本在产品没有出根解之前,直接交给了TAC,让其与一线一起前置处理,避免过多的流转到LMT。

## 检查脚本

```bash
#!/bin/bash
# Environment pre-check script: surfaces environment problems early so that
# installs and upgrades fail less often.
# by hht
#
# Usage: upload the script, `chmod +x pre_check.sh`, then run `./pre_check.sh`.
# Requirement: key-based (non-interactive) SSH from the machine running this
# script to both servers, because every remote check runs ssh with
# BatchMode=yes.

# Everything written to stderr goes to this log file instead of the terminal.
ERROR_LOG=/tmp/pre_test_error.log
exec 2>"$ERROR_LOG"

# ANSI colour codes
GREEN='\033[0;32m'
RED='\033[0;31m'
NC='\033[0m' # reset to the default colour
YELLOW='\033[0;33m'
YELLOW_BG='\033[43m'
BLACK='\033[30m' # black foreground
none='\e[0m'
BLUE="\e[0;94m"

# Maximum clock difference (seconds) tolerated by date_check. The two clocks
# are sampled by two consecutive ssh calls, so a small difference is expected
# even when both hosts are perfectly synchronised.
MAX_TIME_SKEW=${MAX_TIME_SKEW:-2}

# Options shared by every non-interactive ssh call in this script.
SSH_OPTS=(-q -o BatchMode=yes -o ConnectTimeout=5
  -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null)

# Print the arguments on a red background.
_red_bg() { echo -e "\e[41m$*${none}"; }
is_err=$(_red_bg 异常!)

# Print a highlighted warning line.
warn() {
  echo -e "\n${YELLOW_BG}${BLACK}警告!${NC} $*\n"
}

# Print an error line followed by the red "abnormal" marker.
err() {
  echo -e "\n$* $is_err\n"
}

# Print the blue title of a check section.
section() {
  echo -e "\n${BLUE}$1${NC}\n"
}

# Print the horizontal rule that separates two check sections.
separator() {
  echo -e "\n------------------------------------------------------------------------------"
}

# Run a command on a remote host.
# Arguments: $1 - ssh target ("ip" or "user@ip"); $2.. - remote command.
# Returns:   the remote command's status, or 255 when ssh itself failed.
remote() {
  local target=$1
  shift
  # stdin is detached so ssh can never swallow answers typed at the prompts.
  ssh "${SSH_OPTS[@]}" "$target" "$@" </dev/null
}

# Validate a dotted-quad IPv4 address.
# Arguments: $1 - candidate address.
# Returns:   0 when it has four numeric parts that are each <= 255, else 1.
is_valid_ip() {
  local ip=$1 octet
  local -a octets
  [[ $ip =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}$ ]] || return 1
  IFS=. read -r -a octets <<<"$ip"
  for octet in "${octets[@]}"; do
    # 10# forces base 10 so a leading zero ("08") is not parsed as octal.
    ((10#$octet <= 255)) || return 1
  done
  return 0
}

# Ask for a server IP until a valid one is entered.
# Arguments: $1 - name of the variable that receives the address;
#            $2 - role shown in the prompt (master / worker).
# The prompt is echoed to stdout because stderr is redirected to the log and
# `read -p` writes its prompt to stderr.
prompt_ip() {
  local var_name=$1 role=$2 answer
  while true; do
    echo "请输入${role}服务器的IP地址:"
    # On EOF stop instead of warning in an endless loop.
    read -r answer || exit 1
    if is_valid_ip "$answer"; then
      printf -v "$var_name" '%s' "$answer"
      return 0
    fi
    warn "无效的IP地址,请重新输入。"
  done
}

# Ping both servers (3 packets each). Both results are always reported; the
# script then aborts if either server is unreachable.
# Arguments: $1 - master IP; $2 - worker IP.
ping_check() {
  local server1_ip=$1 server2_ip=$2
  local ip unreachable=0
  for ip in "$server1_ip" "$server2_ip"; do
    if ping -c 3 "$ip" >/dev/null 2>&1; then
      echo -e "$ip 可以Ping通。${GREEN}正常${NC}"
    else
      err "$ip 无法Ping通。"
      unreachable=1
    fi
  done
  ((unreachable == 0)) || exit 1
}

# Verify that this machine can open a non-interactive SSH session as root to
# each server, using the same options as all the later checks.
# Arguments: $1 - master IP; $2 - worker IP.
ssh_check() {
  local server1_ip=$1 server2_ip=$2
  local ip
  for ip in "$server1_ip" "$server2_ip"; do
    # timeout guards against a session that connects but then hangs.
    if timeout 10s ssh "${SSH_OPTS[@]}" "root@$ip" true </dev/null; then
      echo -e "Success: SSH to $ip connected.${GREEN}正常${NC}"
    else
      err "SSH to $ip failed"
    fi
  done
}

# Compare the clocks of the two servers (epoch seconds read over ssh).
# Arguments: $1 - master IP; $2 - worker IP.
# Globals:   MAX_TIME_SKEW (read).
date_check() {
  local server1_ip=$1 server2_ip=$2
  local server1_time server2_time skew
  server1_time=$(remote "root@$server1_ip" date +%s)
  server2_time=$(remote "root@$server2_ip" date +%s)
  if [[ ! $server1_time =~ ^[0-9]+$ || ! $server2_time =~ ^[0-9]+$ ]]; then
    warn "无法获取服务器时间,请检查SSH连接。"
    return 1
  fi
  skew=$((server1_time - server2_time))
  ((skew < 0)) && skew=$((-skew))
  if ((skew <= MAX_TIME_SKEW)); then
    echo -e "两台服务器的时间一致(相差${skew}秒)。${GREEN}正常${NC}"
  else
    err "两台服务器的时间不一致。"
    # Convert the timestamps to human-readable dates.
    echo -e "${server1_ip}时间为:$(date -d "@$server1_time")"
    echo -e "${server2_ip}时间为:$(date -d "@$server2_time")"
  fi
}

# Report whether firewalld is active on a server.
# Arguments: $1 - server IP.
check_firewall_status() {
  local server_ip=$1
  local firewall_status rc
  firewall_status=$(remote "root@$server_ip" "systemctl is-active firewalld")
  rc=$?
  # ssh exits with 255 when the connection itself failed; any other status
  # comes from systemctl.
  if ((rc == 255)); then
    warn "无法连接$server_ip 服务器,未能检查防火墙状态。"
  elif [[ $firewall_status == "active" ]]; then
    warn "$server_ip 防火墙已开启。请根据http://172.17.160.32:18090/x/cYA0CQ检查端口"
  else
    echo "$server_ip 防火墙未开启。"
  fi
}

# Warn when a resolv.conf has no nameserver line or more than one.
# Arguments: $1 - label of the server used in the message; $2 - line count.
_warn_nameserver_count() {
  local label=$1 count=$2
  if ((count == 0)); then
    warn "${label}的DNS配置不存在nameserver记录。请判断是否影响集群。"
  elif ((count > 1)); then
    warn "${label}的DNS配置存在多行nameserver记录。请判断是否影响集群。"
  fi
}

# Compare /etc/resolv.conf of both servers and count their nameserver lines.
# Arguments: $1 - ssh user (used for both servers); $2 - master IP;
#            $3 - worker IP.
dns_check() {
  local server1_user=$1 server2_user=$1
  local server1_ip=$2 server2_ip=$3
  local server1_dns server2_dns count1 count2

  if ! server1_dns=$(remote "$server1_user@$server1_ip" cat /etc/resolv.conf) \
    || ! server2_dns=$(remote "$server2_user@$server2_ip" cat /etc/resolv.conf); then
    # Without this guard two failed reads would compare as "consistent".
    warn "无法读取/etc/resolv.conf,请检查SSH连接。"
    return 1
  fi

  # Are the two files identical?
  if [[ $server1_dns == "$server2_dns" ]]; then
    echo -e "两台服务器的DNS配置一致。${GREEN}正常${NC}"
  else
    warn "两台服务器的DNS配置不一致。请判断是否影响集群。"
  fi

  # Does each file contain exactly one nameserver line?
  count1=$(grep -c '^nameserver' <<<"$server1_dns")
  count2=$(grep -c '^nameserver' <<<"$server2_dns")
  if ((count1 == 1 && count2 == 1)); then
    echo -e "两台服务器的DNS配置中只存在一行nameserver记录。${GREEN}正常${NC}"
  else
    _warn_nameserver_count "第一台服务器($server1_ip)" "$count1"
    _warn_nameserver_count "第二台服务器($server2_ip)" "$count2"
  fi
}

# Check that /opt/local-path-provisioner appears in the mount table of both
# servers.
# Arguments: $1 - ssh user (used for both servers); $2 - master IP;
#            $3 - worker IP.
mount_check() {
  local server1_user=$1 server2_user=$1
  local server1_ip=$2 server2_ip=$3
  local server1_mount server2_mount
  server1_mount=$(remote "$server1_user@$server1_ip" mount | grep /opt/local-path-provisioner)
  server2_mount=$(remote "$server2_user@$server2_ip" mount | grep /opt/local-path-provisioner)

  if [[ -n $server1_mount && -n $server2_mount ]]; then
    echo -e "两台服务器都正确挂载了/opt/local-path-provisioner目录。${GREEN}正常${NC}"
    return 0
  fi
  if [[ -z $server1_mount ]]; then
    err "第一台服务器($server1_ip)未正确挂载/opt/local-path-provisioner目录。"
  fi
  if [[ -z $server2_mount ]]; then
    err "第二台服务器($server2_ip)未正确挂载/opt/local-path-provisioner目录。"
  fi
}

# Legacy (disabled): check whether the SFTP service is reachable.
# check_sftp() {
#   local server_ip=$1
#   echo -e "\n-----------使用Telnet检查SFTP服务是否联通-------------"
#   # # Connect to the SSH port (22 by default)
#   # # Try to connect to the SFTP server
#   # sftp -oPort=22 $server_ip </dev/null
#   # quit
#   # EOF
#   # # Check the exit status of the SFTP command
#   # if [ $? -eq 0 ]; then
#   #   echo "SFTP on $IP is working normally"
#   # else
#   #   echo "SFTP on $IP is not working!"
#   # fi
#   # Test the SFTP service through an ssh command
#   ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null sftpuser@$server_ip sftp exit > /dev/null 2>&1
#   # Check the exit status of the ssh command
#   if [ $? -eq 0 ]; then
#     echo "SFTP服务正常,可以连接到主机 $server_ip 的SFTP服务。"
#   else
#     err "SFTP服务异常,无法连接到主机 $server_ip 的SFTP服务。"
#   fi
# }
# # Run the check
# check_sftp $server_ip

# Check that sshd is active on a server and that sshd_config carries the
# three SFTP directives the product expects.
# Arguments: $1 - server IP; $2 - ssh user.
check_sshd_config() {
  local server_ip=$1 server_user=$2
  local sshd_config_content

  if ! remote "$server_user@$server_ip" "systemctl is-active sshd" >/dev/null 2>&1; then
    warn "无法连接服务器或检查SSH服务状态。"
    return 1
  fi

  # Commented-out lines are dropped so a disabled directive does not count
  # as configured.
  sshd_config_content=$(remote "$server_user@$server_ip" "cat /etc/ssh/sshd_config" \
    | grep -v '^[[:space:]]*#')

  if grep -q -E "Subsystem\s+sftp\s+internal-sftp" <<<"$sshd_config_content" \
    && grep -q "Match User sftpuser" <<<"$sshd_config_content" \
    && grep -q "ChrootDirectory /opt/ftpfile/sftp/sftpuser/" <<<"$sshd_config_content"; then
    echo -e "SSH配置正常。${GREEN}正常${NC}"
  else
    warn "请检查sshd_config文件,是否正确配置sftp。"
  fi
}

# Legacy (disabled): check SFTP connectivity of the master with a password.
# NOTE(review): the original passed a hard-coded password literal here; it is
# replaced by $SFTP_PASSWORD so the credential is not published with the post.
# sftp_with_password() {
#   echo -e "\n-----------检查master的sftp连通性-------------"
#   local server_ip=$1
#   local server_password=$2
#   # Use expect to type the password automatically
#   expect -c "
#     spawn sftp -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null sftpuser@$server_ip
#     expect {
#       \"password:\" {
#         send \"$server_password\n\"
#         exp_continue
#       }
#       eof
#     }
#   "
#   if [ $? -eq 0 ]; then
#     echo -e "$server_ip SFTP连接成功。${GREEN}正常${NC}"
#   else
#     warn "无法连接$server_ip 的SFTP服务。"
#   fi
# }
# # Run the SFTP connection check
# sftp_with_password $server1_ip "$SFTP_PASSWORD"

# Legacy (disabled): check whether a directory has mode 777.
# check_directory_permission() {
#   local server_ip=$1
#   local directory_path=$2
#   echo -e "\n-----------检查master的目录${directory_path}权限-------------"
#   # Run stat over ssh to read the directory mode
#   ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null sftpuser@$server_ip "stat -c %a $directory_path" > /dev/null 2>&1
#   if [ $? -eq 0 ]; then
#     # Read the directory mode
#     directory_permission=$(ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null sftpuser@$server_ip "stat -c %a $directory_path")
#     if [ "$directory_permission" -eq 777 ]; then
#       echo -e "$directory_path 目录权限为777。${GREEN}正常${NC}"
#     else
#       warn "$directory_path 目录权限不是777。"
#     fi
#   else
#     warn "无法连接$server_ip 服务器或获取目录权限。"
#   fi
# }
# check_directory_permission $server1_ip "/opt"
# check_directory_permission $server1_ip "/opt/ftpfile/sftp/sftpuser"

# Check that a user account exists on a server.
# Arguments: $1 - ssh target; $2 - user name.
check_user() {
  local target=$1 user_name=$2
  if remote "$target" "id $user_name >/dev/null 2>&1"; then
    echo -e "$user_name用户在$target上存在。${GREEN}正常${NC}"
  else
    err "$user_name用户在$target上不存在"
  fi
}

# Check that a remote directory has the expected octal mode.
# Arguments: $1 - ssh target; $2 - directory path; $3 - expected mode.
# The mode is fetched over ssh and compared locally: warn/err are functions
# of this script and do not exist in the remote shell.
check_dir_perm() {
  local target=$1 dir_path=$2 expected=$3
  local actual
  actual=$(remote "$target" "stat -c %a '$dir_path'")
  if [[ ! $actual =~ ^[0-7]+$ ]]; then
    err "无法获取${target}上$dir_path目录的权限"
  elif [[ $actual == "$expected" ]]; then
    echo -e "$dir_path目录为$expected权限。${GREEN}正常${NC}"
  else
    err "$dir_path目录不为$expected权限(当前为$actual)"
  fi
}

# Check that the server's hostname is listed in its own /etc/hosts.
# Arguments: $1 - server IP; $2 - ssh user.
check_hostname_and_hosts() {
  local server_ip=$1 server_user=$2
  local server_hostname hosts_file_content

  if ! server_hostname=$(remote "$server_user@$server_ip" "hostname") \
    || [[ -z $server_hostname ]]; then
    warn "无法连接$server_ip 服务器或获取主机名。"
    return 1
  fi

  hosts_file_content=$(remote "$server_user@$server_ip" "cat /etc/hosts")
  # The hostname may be any name on a line (canonical name or alias). It is
  # compared as a literal string, so dots in it are not regex wildcards.
  if awk -v host="$server_hostname" '
      /^[[:space:]]*#/ { next }
      {
        sub(/#.*/, "")
        for (i = 2; i <= NF; i++) if ($i == host) found = 1
      }
      END { exit !found }' <<<"$hosts_file_content"; then
    echo -e "$server_ip 主机名与/etc/hosts文件一致。${GREEN}正常${NC}"
  else
    warn "$server_ip 主机名与/etc/hosts文件不一致。"
  fi
}

# Check that the server's hostname is one of the Kubernetes node names
# returned by kubectl on that server.
# Arguments: $1 - server IP; $2 - ssh user;
#            $3 - node index, accepted for backward compatibility but no
#                 longer used: a node's position in the list is not tied to
#                 its role, so a fixed index can produce false warnings.
check_hostname_and_k8s_node() {
  local server_ip=$1 server_user=$2
  local node_names node_name server_hostname

  if ! node_names=$(remote "$server_user@$server_ip" \
    "kubectl get node -o jsonpath='{.items[*].metadata.name}'") \
    || [[ -z $node_names ]]; then
    warn "无法连接$server_ip 服务器或获取Kubernetes节点名。"
    return 1
  fi

  server_hostname=$(remote "$server_user@$server_ip" "hostname")
  # Unquoted on purpose: jsonpath prints the names separated by spaces.
  for node_name in $node_names; do
    if [[ $node_name == "$server_hostname" ]]; then
      echo -e "$server_ip 主机名与Kubernetes节点名一致。${GREEN}正常${NC}"
      return 0
    fi
  done
  warn "$server_ip 主机名与Kubernetes节点名不一致。"
}

# Legacy (disabled): single-shot disk write/read test (cached).
# disk_speed_test() {
#   # local server_ip=$1
#   # local server_user=$2
#   # local test_file="/tmp/disk_speed_test_file"
#   # # Write test
#   # write_speed=$(ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $server_user@$server_ip "dd if=/dev/zero of=${test_file} bs=8k count=128 conv=fsync oflag=direct" 2>&1 | tail -n 1)
#   # echo -e "${server_ip} 写入速度: $write_speed"
#   # # Read test
#   # read_speed=$(ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $server_user@$server_ip "sudo dd if=${test_file} of=/dev/null bs=8k count=128 conv=fsync iflag=direct" 2>&1 | tail -n 1)
#   # echo -e "${server_ip} 读取速度: $read_speed"
#   # # Remove the test file
#   # ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $server_user@$server_ip "rm $test_file"
# }

# Legacy (disabled): single-shot disk write/read test with the disk write
# cache switched off through hdparm.
# disk_speed_test_no_cache() {
#   local server_ip=$1
#   local server_user=$2
#   local test_file="/tmp/disk_speed_test_file"
#   # Disable the disk cache
#   ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $server_user@$server_ip "hdparm -W0 /dev/mapper/centos-root" 2>/dev/null
#   # Write test
#   write_speed=$(ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $server_user@$server_ip "dd if=/dev/zero of=${test_file} bs=8k count=128 conv=fsync oflag=direct" 2>&1 | tail -n 1)
#   echo -e "${server_ip} 写入速度: $write_speed"
#   # Read test
#   read_speed=$(ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $server_user@$server_ip "sudo dd if=${test_file} of=/dev/null bs=8k count=128 conv=fsync iflag=direct" 2>&1 | tail -n 1)
#   echo -e "${server_ip} 读取速度: $read_speed"
#   # Remove the test file
#   ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $server_user@$server_ip "rm $test_file"
#   # Re-enable the disk cache
#   ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $server_user@$server_ip "hdparm -W1 /dev/mapper/centos-root" 2>/dev/null
# }

# Repeat a 1 MiB dd write (bs=8k count=128) on a server for 30 seconds and
# print the average throughput reported by dd.
# Arguments: $1 - ssh target; $2 - extra dd flags;
#            $3 - recommended minimum in MB/s.
# The test file output.file is written to the ssh user's login directory and
# removed afterwards.
_dd_write_benchmark() {
  local server_ip=$1 dd_flags=$2 recommended=$3
  local test_duration=30 # seconds
  local results start dd_out rc stats count average colour

  results=$(mktemp) || {
    err "无法创建临时文件"
    return 1
  }
  echo "开始测试 $server_ip 过程会持续30s..."
  start=$(date +%s)
  while (($(date +%s) - start <= test_duration)); do
    # LC_ALL=C keeps dd's summary line in one known format; dd prints it on
    # stderr, hence the 2>&1 1>/dev/null swap.
    dd_out=$(remote "$server_ip" \
      "LC_ALL=C dd if=/dev/zero of=output.file bs=8k count=128 $dd_flags 2>&1 1>/dev/null")
    rc=$?
    # ssh itself failed: retrying for 30 seconds would not help.
    ((rc == 255)) && break
    grep -oE '[0-9.]+ ([kMGT]?B|bytes)/(s|秒)' <<<"$dd_out" | tail -n 1 >>"$results"
  done
  remote "$server_ip" "rm -f output.file"

  # Normalise every sample to MB/s (dd uses decimal units) and average them.
  stats=$(awk '
    {
      value = $1
      unit = $2
      sub(/\/.*$/, "", unit)
      if (unit == "B" || unit == "bytes") value /= 1000000
      else if (unit == "kB") value /= 1000
      else if (unit == "GB") value *= 1000
      else if (unit == "TB") value *= 1000000
      sum += value
      n++
    }
    END {
      if (n > 0) printf "%d %.2f", n, sum / n
      else print "0 0"
    }' "$results")
  rm -f -- "$results"

  read -r count average <<<"$stats"
  if ((count == 0)); then
    warn "$server_ip 未获取到任何dd测速结果,无法计算平均速度。"
    return 1
  fi

  if awk -v a="$average" -v r="$recommended" 'BEGIN { exit !(a + 0 >= r + 0) }'; then
    colour=$GREEN
  else
    colour=$RED
  fi
  echo -e "平均速度 $server_ip: ${colour}$average MB/s${NC}。推荐值:${GREEN}>=${recommended}MB/s${NC}"
}

# Disk write speed test through the page cache (dd conv=fsync).
# Arguments: $1 - ssh target.
disk_speed_test() {
  _dd_write_benchmark "$1" "conv=fsync" 200
}

# Disk write speed test bypassing the page cache (dd oflag=direct).
# Arguments: $1 - ssh target.
disk_speed_test_no_cache() {
  _dd_write_benchmark "$1" "oflag=direct,nonblock conv=fsync" 50
}

# Check whether the CPU of a server advertises the AVX flag.
# Arguments: $1 - server IP.
check_avx_support() {
  local server_ip=$1 rc
  remote "root@$server_ip" "grep -qw avx /proc/cpuinfo"
  rc=$?
  if ((rc == 0)); then
    echo -e "$server_ip CPU 支持 AVX (Advanced Vector Extensions)。${GREEN}正常${NC}"
  elif ((rc == 255)); then
    # ssh failure must not be reported as "AVX not supported".
    warn "无法连接$server_ip 服务器,未能检查 AVX 支持。"
  else
    warn "$server_ip CPU 不支持 AVX (Advanced Vector Extensions)。"
  fi
}

# Load every *.tgz image archive found in the product image directories into
# the Docker daemon of the machine running this script.
load_images() {
  local image_directories=("/opt/xx/images/product/insight"
    "/opt/xx/images/product/insight/patch")
  local directory file

  for directory in "${image_directories[@]}"; do
    if [[ ! -d $directory ]]; then
      echo "目录 $directory 不存在。"
      continue
    fi
    echo "进入目录: $directory"
    for file in "$directory"/*.tgz; do
      # An unmatched glob stays literal; skip it.
      [[ -e $file ]] || continue
      echo "加载镜像: ${file##*/}"
      docker load <"$file" || err "镜像 ${file##*/} 加载失败"
    done
  done
}

# Run every check in order, then offer the optional disk test and image load.
main() {
  local confirm

  # Accounts used for the remote checks
  server1_user="root"
  server2_user="root"

  echo -e "\n${GREEN}------------------------------------前置检查------------------------------------${NC}"
  echo

  # Ask for the two server addresses
  prompt_ip server1_ip master
  prompt_ip server2_ip worker

  section "ping测试:"
  ping_check "$server1_ip" "$server2_ip"
  separator

  section "SSH测试:"
  ssh_check "$server1_ip" "$server2_ip"
  separator

  section "时间一致性检查:"
  date_check "$server1_ip" "$server2_ip"
  separator

  section "检查防火墙:"
  check_firewall_status "$server1_ip"
  check_firewall_status "$server2_ip"
  separator

  section "DNS检查:"
  dns_check "$server1_user" "$server1_ip" "$server2_ip"
  separator

  section "数据盘挂载检查:"
  mount_check "$server1_user" "$server1_ip" "$server2_ip"
  separator

  section "master SSH服务状态和配置检查:"
  check_sshd_config "$server1_ip" "$server1_user"
  separator

  section "检查master的sftp目录权限:"
  check_user "$server1_ip" "sftpuser"
  check_dir_perm "$server1_ip" "/opt" 755
  check_dir_perm "$server1_ip" "/opt/ftpfile/sftp/sftpuser" 755
  separator

  section "检查主机名的一致性:"
  check_hostname_and_hosts "$server1_ip" "$server1_user"
  check_hostname_and_hosts "$server2_ip" "$server2_user"
  check_hostname_and_k8s_node "$server1_ip" "$server1_user" 0
  check_hostname_and_k8s_node "$server2_ip" "$server2_user" 1
  separator

  section "检查 CPU 是否支持 AVX:"
  check_avx_support "$server1_ip"
  check_avx_support "$server2_ip"

  # Optional disk performance test
  section "磁盘测试:"
  echo "是否要检查磁盘? (y/n):"
  read -r confirm
  if [[ $confirm == "y" ]]; then
    section "磁盘写入速度测试(不带缓存):"
    disk_speed_test_no_cache "$server1_ip"
    disk_speed_test_no_cache "$server2_ip"
    section "磁盘写入速度测试(带缓存):"
    disk_speed_test "$server1_ip"
    disk_speed_test "$server2_ip"
  else
    echo "取消磁盘检查。"
  fi

  # Only needed when images have gone missing
  section "镜像加载:"
  echo "是否要加载 Docker 镜像? (y/n):"
  read -r confirm
  if [[ $confirm == "y" ]]; then
    load_images
  else
    echo "取消加载 Docker 镜像。"
  fi
}

main "$@"
```