title: 前置检查脚本 author: Gamehu date: 2024-08-18 22:19:13
写这篇文章是因为遇到了太多环境类问题:根据LMT建立的数据统计,这类问题约占我处理问题的60%以上,约占所有现场问题的20%以上。所以抽了个时间,把可以在前置检查中避免的问题都梳理出来,试着写了个脚本,并一次次到现场验证,最终有了以下版本。在产品给出根本解决方案之前,该脚本直接交给了TAC,让其与一线一起做前置处理,避免过多问题流转到LMT。
```
#!/bin/bash
# Environment pre-check script: surfaces problems early so that fewer installs and upgrades fail.
# by hht
# Usage: upload the file, grant execute permission with chmod +x, then run ./pre_check.sh directly.
# From here on, everything written to stderr goes to this log file instead of the terminal.
exec 2>/tmp/pre_test_error.log
# ANSI escape sequences. They are stored as literal backslash text and only
# become control codes when printed through `echo -e`.

# Reset codes
NC='\033[0m'  # restore default attributes
none='\e[0m'  # same reset, referenced by _red_bg

# Foreground colours
BLACK='\033[30m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\e[0;94m'

# Background colours
YELLOW_BG='\033[43m'
# Print all arguments (joined by spaces) on a red background, followed by the
# ${none} reset sequence.
_red_bg() {
  local text="$*"
  echo -e "\e[41m${text}${none}"
}
# Pre-rendered red badge that err() appends to its messages.
is_err="$(_red_bg 异常!)"
# Print a warning: a blank line, the highlighted warning badge, the message
# (all arguments joined by spaces) and a trailing blank line. Output goes to
# stdout.
warn() {
  local message="$*"
  echo -e "\n${YELLOW_BG}${BLACK}警告!${NC} ${message}\n"
}
# Print an error: a blank line, the message (all arguments joined by spaces),
# the $is_err badge and a trailing blank line. Output goes to stdout.
err() {
  local message="$*"
  echo -e "\n${message} ${is_err}\n"
}
#######################################
# Validate a dotted-quad IPv4 address.
# Arguments: $1 - candidate address
# Returns:   0 when $1 is four dot-separated decimal fields of 1-3 digits,
#            each in the range 0-255; 1 otherwise.
#######################################
is_valid_ip() {
  local ip="$1"
  local ip_regex='^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$'
  local octet

  [[ $ip =~ $ip_regex ]] || return 1

  # The regex alone would accept values such as 999; check each captured field.
  for octet in "${BASH_REMATCH[@]:1}"; do
    # 10# forces base 10 so fields like "08" are not rejected as bad octal.
    ((10#$octet <= 255)) || return 1
  done
  return 0
}
# Login accounts used for server1 and server2.
server1_user=root
server2_user=root

# Green banner that opens the pre-check output, followed by one empty line.
echo -e "\n${GREEN}------------------------------------前置检查------------------------------------${NC}"
echo ""
# Ask for the master and worker IP addresses, re-prompting until each is valid.

#######################################
# Prompt for one server's IP address until is_valid_ip accepts the answer.
# Arguments: $1 - role shown in the prompt (master / worker)
#            $2 - name of the global variable that receives the address
# Exits the script with status 1 if input ends before a valid answer is read.
#######################################
prompt_for_ip() {
  local role=$1
  local target_var=$2
  local answer

  while true; do
    echo "请输入${role}服务器的IP地址:"
    # read fails at end-of-input; without this guard the loop would never end.
    if ! read -r answer; then
      err "输入已结束,未能获取${role}服务器的IP地址。"
      exit 1
    fi
    if is_valid_ip "$answer"; then
      printf -v "$target_var" '%s' "$answer"
      return 0
    fi
    warn "无效的IP地址,请重新输入。"
  done
}

prompt_for_ip master server1_ip
prompt_for_ip worker server2_ip
# Ping test
echo -e "\n${BLUE}ping测试:${NC}\n"

#######################################
# Send three ICMP echo requests to each host and print one result per host.
# Arguments: $1 - first host (master), $2 - second host (worker)
# NOTE(review): only an unreachable second host terminates the script
# (exit 1); an unreachable first host is just reported. Confirm that this
# asymmetry is intended.
#######################################
ping_check() {
  local first_ip=$1
  local second_ip=$2

  if ! ping -c 3 "$first_ip" > /dev/null 2>&1; then
    err "$first_ip 无法Ping通。"
  else
    echo -e "$first_ip 可以Ping通。${GREEN}正常${NC}"
  fi

  if ! ping -c 3 "$second_ip" > /dev/null 2>&1; then
    err "$second_ip 无法Ping通。" && exit 1
  else
    echo -e "$second_ip 可以Ping通。${GREEN}正常${NC}"
  fi
}
# Run the ping check, then print a divider and the heading of the SSH section.
ping_check "$server1_ip" "$server2_ip"
echo -e "\n------------------------------------------------------------------------------"
# SSH test
echo -e "\n${BLUE}SSH测试:${NC}\n"
#######################################
# Verify that this machine can open a non-interactive SSH session as root to
# each of the two hosts, using the same ssh options as the later checks.
# The previous version logged in to the first host only, yet reported the
# result as "from server1 to server2", and relied on sshpass without giving it
# a password source.
# Arguments: $1 - first host, $2 - second host
# Outputs:   one success or error line per host on stdout
#######################################
ssh_check() {
  local server1_ip=$1
  local server2_ip=$2
  local target

  for target in "$server1_ip" "$server2_ip"; do
    # BatchMode makes ssh fail instead of prompting for a password.
    if ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
      -o UserKnownHostsFile=/dev/null "root@${target}" true > /dev/null 2>&1; then
      echo -e "Success: SSH to ${target} connected.${GREEN}正常${NC}"
    else
      err "SSH to ${target} failed"
    fi
  done
}
# Run the SSH check, then print a divider and the heading of the clock section.
ssh_check "$server1_ip" "$server2_ip"
echo -e "\n------------------------------------------------------------------------------"
# Clock consistency check
echo -e "\n${BLUE}时间一致性检查:${NC}\n"
#######################################
# Compare the clocks of two hosts.
# Each remote timestamp is measured relative to the local clock sampled just
# before the corresponding ssh call, so the time spent on the first connection
# is not counted as clock skew.
# Arguments: $1 - first host
#            $2 - second host
#            $3 - optional tolerated difference in seconds (default 2)
# Returns:   1 when a timestamp cannot be obtained from either host
#######################################
date_check() {
  local server1_ip=$1
  local server2_ip=$2
  local max_diff=${3:-2}
  local -a ssh_opts=(-q -o BatchMode=yes -o ConnectTimeout=5
    -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null)
  local local1_time local2_time server1_time server2_time
  local formatted1_time formatted2_time diff

  local1_time=$(date +%s)
  server1_time=$(ssh "${ssh_opts[@]}" "root@${server1_ip}" date +%s)
  local2_time=$(date +%s)
  server2_time=$(ssh "${ssh_opts[@]}" "root@${server2_ip}" date +%s)

  # An empty or non-numeric answer means the remote query failed.
  if ! [[ $server1_time =~ ^[0-9]+$ ]]; then
    err "无法获取 ${server1_ip} 的时间。"
    return 1
  fi
  if ! [[ $server2_time =~ ^[0-9]+$ ]]; then
    err "无法获取 ${server2_ip} 的时间。"
    return 1
  fi

  diff=$(((server1_time - local1_time) - (server2_time - local2_time)))
  if ((diff < 0)); then
    diff=$((-diff))
  fi

  if ((diff <= max_diff)); then
    echo -e "两台服务器的时间一致(偏差不超过${max_diff}秒)。${GREEN}正常${NC}"
  else
    # Convert the epoch timestamps to readable dates for the report.
    formatted1_time=$(date -d "@${server1_time}")
    formatted2_time=$(date -d "@${server2_time}")
    err "两台服务器的时间不一致。"
    echo -e "${server1_ip}时间为:${formatted1_time}"
    echo -e "${server2_ip}时间为:${formatted2_time}"
  fi
}
# Run the clock check, then print a divider and the heading of the firewall section.
date_check "$server1_ip" "$server2_ip"
echo -e "\n------------------------------------------------------------------------------"
# Check whether the firewall is enabled
echo -e "\n${BLUE}检查防火墙:${NC}\n"
#######################################
# Report whether firewalld is active on a host.
# Arguments: $1 - host to query (as root over ssh)
# Outputs:   a warning when firewalld is active or its state cannot be read,
#            otherwise a plain "not enabled" line
#######################################
check_firewall_status() {
  local server_ip=$1
  local firewall_status

  firewall_status=$(ssh -q -o BatchMode=yes -o ConnectTimeout=5 \
    -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
    "root@${server_ip}" "systemctl is-active firewalld")

  case "$firewall_status" in
    active)
      warn "$server_ip 防火墙已开启。请根据http://172.17.160.32:18090/x/cYA0CQ检查端口"
      ;;
    "")
      # No output at all means the query itself failed, so the state is
      # unknown rather than "disabled".
      warn "无法获取 $server_ip 的防火墙状态。"
      ;;
    *)
      echo "$server_ip 防火墙未开启。"
      ;;
  esac
}
# Check the firewall on both servers (the function takes the server IP only),
# then print a divider and the heading of the DNS section.
for firewall_host in "$server1_ip" "$server2_ip"; do
  check_firewall_status "$firewall_host"
done
echo -e "\n------------------------------------------------------------------------------"
# DNS configuration check
echo -e "\n${BLUE}DNS检查:${NC}\n"
# Warn when a server's resolv.conf has no nameserver line or more than one.
# Arguments: $1 - label of the server, $2 - its IP, $3 - nameserver line count
_dns_warn_nameserver_count() {
  local label=$1
  local ip=$2
  local count=$3

  if ((count == 0)); then
    warn "${label}(${ip})的DNS配置中没有nameserver记录。请判断是否影响集群。"
  elif ((count > 1)); then
    warn "${label}(${ip})的DNS配置存在多行nameserver记录。请判断是否影响集群。"
  fi
}

#######################################
# Compare /etc/resolv.conf of two servers and check their nameserver counts.
# Arguments: $1 - login user (used for both servers)
#            $2 - first server IP
#            $3 - second server IP
# Returns:   1 when resolv.conf cannot be read from one of the servers
#######################################
dns_check() {
  local server1_user=$1
  local server2_user=$1
  local server1_ip=$2
  local server2_ip=$3
  local -a ssh_opts=(-q -o BatchMode=yes -o ConnectTimeout=5
    -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null)
  local server1_dns server2_dns
  local server1_nameserver_count server2_nameserver_count

  # A failed fetch must not be mistaken for an empty or matching configuration.
  if ! server1_dns=$(ssh "${ssh_opts[@]}" "${server1_user}@${server1_ip}" cat /etc/resolv.conf); then
    warn "无法读取第一台服务器(${server1_ip})的/etc/resolv.conf。"
    return 1
  fi
  if ! server2_dns=$(ssh "${ssh_opts[@]}" "${server2_user}@${server2_ip}" cat /etc/resolv.conf); then
    warn "无法读取第二台服务器(${server2_ip})的/etc/resolv.conf。"
    return 1
  fi

  # Check 1: both files are identical.
  if [[ $server1_dns == "$server2_dns" ]]; then
    echo -e "两台服务器的DNS配置一致。${GREEN}正常${NC}"
  else
    warn "两台服务器的DNS配置不一致。请判断是否影响集群。"
  fi

  # Check 2: exactly one nameserver line on each server.
  server1_nameserver_count=$(grep -c '^nameserver' <<< "$server1_dns")
  server2_nameserver_count=$(grep -c '^nameserver' <<< "$server2_dns")
  if ((server1_nameserver_count == 1 && server2_nameserver_count == 1)); then
    echo -e "两台服务器的DNS配置中只存在一行nameserver记录。${GREEN}正常${NC}"
    return 0
  fi
  _dns_warn_nameserver_count "第一台服务器" "$server1_ip" "$server1_nameserver_count"
  _dns_warn_nameserver_count "第二台服务器" "$server2_ip" "$server2_nameserver_count"
}
# Run the DNS check, then print a divider and the heading of the mount section.
dns_check "$server1_user" "$server1_ip" "$server2_ip"
echo -e "\n------------------------------------------------------------------------------"
# Mount check
echo -e "\n${BLUE}数据盘挂载检查:${NC}\n"
#######################################
# Check that the `mount` output of both servers contains a line mentioning
# /opt/local-path-provisioner.
# Arguments: $1 - login user (used for both servers)
#            $2 - first server IP
#            $3 - second server IP
# Globals:   server1_mount, server2_mount (written; matching mount lines)
#######################################
mount_check() {
  local login_user=$1
  local first_ip=$2
  local second_ip=$3
  local -a ssh_opts=(-q -o BatchMode=yes -o ConnectTimeout=5
    -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null)

  # grep runs locally on the mount table printed by the remote host.
  server1_mount=$(ssh "${ssh_opts[@]}" "${login_user}@${first_ip}" mount | grep /opt/local-path-provisioner)
  server2_mount=$(ssh "${ssh_opts[@]}" "${login_user}@${second_ip}" mount | grep /opt/local-path-provisioner)

  if [[ -n $server1_mount && -n $server2_mount ]]; then
    echo -e "两台服务器都正确挂载了/opt/local-path-provisioner目录。${GREEN}正常${NC}"
    return
  fi

  # At least one server is missing the mount; report each one separately.
  if [[ -z $server1_mount ]]; then
    err "第一台服务器(${first_ip})未正确挂载/opt/local-path-provisioner目录。"
  fi
  if [[ -z $server2_mount ]]; then
    err "第二台服务器(${second_ip})未正确挂载/opt/local-path-provisioner目录。"
  fi
}
# Run the mount check and close the section with a divider.
mount_check "$server1_user" "$server1_ip" "$server2_ip"
echo -e "\n------------------------------------------------------------------------------"
# 使用Telnet检查SFTP服务是否联通
# check_sftp() {
# local server_ip=$1
# echo -e "\n-----------使用Telnet检查SFTP服务是否联通-------------"
# # # 使用Telnet连接到SSH端口(默认是22)
# # # 尝试连接SFTP服务器
# # sftp -oPort=22 $server_ip <<EOF 2>/dev/null
# # quit
# # EOF
# # # Check the exit status of the SFTP command
# # if [ $? -eq 0 ]; then
# # echo "SFTP on $IP is working normally"
# # else
# # echo "SFTP on $IP is not working!"
# # fi
# # 使用SSH命令测试SFTP服务
# ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null sftpuser@$server_ip sftp exit > /dev/null 2>&1
# # 检查SSH命令的退出状态码
# if [ $? -eq 0 ]; then
# echo "SFTP服务正常,可以连接到主机 $server_ip 的SFTP服务。"
# else
# err "SFTP服务异常,无法连接到主机 $server_ip 的SFTP服务。"
# fi
# }
# # 调用函数来进行Telnet检查
# check_sftp $server_ip
# Check the sshd service state and its sftp configuration on the master
echo -e "\n${BLUE}master SSH服务状态和配置检查:${NC}\n"

#######################################
# Verify that sshd is active on a host and that the active (non-comment) lines
# of /etc/ssh/sshd_config contain the three sftp directives: the internal-sftp
# subsystem, a "Match User sftpuser" block and the sftpuser ChrootDirectory.
# Arguments: $1 - host, $2 - login user
#######################################
check_sshd_config() {
  local server_ip=$1
  local server_user=$2
  local -a ssh_opts=(-q -o BatchMode=yes -o ConnectTimeout=5
    -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null)
  local sshd_config_content active_config

  if ! ssh "${ssh_opts[@]}" "${server_user}@${server_ip}" "systemctl is-active sshd" > /dev/null 2>&1; then
    warn "无法连接服务器或检查SSH服务状态。"
    return
  fi

  sshd_config_content=$(ssh "${ssh_opts[@]}" "${server_user}@${server_ip}" "cat /etc/ssh/sshd_config")

  # Drop comment and blank lines first: a commented-out directive is not in
  # effect and must not satisfy the check.
  active_config=$(grep -Ev '^[[:space:]]*(#|$)' <<< "$sshd_config_content")

  if grep -Eq '^[[:space:]]*Subsystem[[:space:]]+sftp[[:space:]]+internal-sftp' <<< "$active_config" \
    && grep -Eq '^[[:space:]]*Match[[:space:]]+User[[:space:]]+sftpuser' <<< "$active_config" \
    && grep -Eq '^[[:space:]]*ChrootDirectory[[:space:]]+/opt/ftpfile/sftp/sftpuser/' <<< "$active_config"; then
    echo -e "SSH配置正常。${GREEN}正常${NC}"
  else
    warn "请检查sshd_config文件,是否正确配置sftp。"
  fi
}
# Run the sshd configuration check against the master and close the section.
check_sshd_config "$server1_ip" "$server1_user"
echo -e "\n------------------------------------------------------------------------------"
# sftp_with_password() {
# echo -e "\n-----------检查master的sftp连通性-------------"
# local server_ip=$1
# local server_password=$2
# # 使用 expect 来自动输入密码
# expect -c "
# spawn sftp -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null sftpuser@$server_ip
# expect {
# \"password:\" {
# send \"$server_password\n\"
# exp_continue
# }
# eof
# }
# "
# if [ $? -eq 0 ]; then
# echo -e "$server_ip SFTP连接成功。${GREEN}正常${NC}"
# else
# warn "无法连接$server_ip 的SFTP服务。"
# fi
# }
# # 调用函数来进行SFTP连接检查
# sftp_with_password $server1_ip "$SFTP_PASSWORD"  # credential redacted: pass it in via an environment variable, never hard-code it
# 检查目录权限是否为777
# check_directory_permission() {
# local server_ip=$1
# local directory_path=$2
# echo -e "\n-----------检查master的目录${directory_path}权限-------------"
# # 使用 ssh 连接到服务器并执行 stat 命令获取目录权限信息
# ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null sftpuser@$server_ip "stat -c %a $directory_path" > /dev/null 2>&1
# if [ $? -eq 0 ]; then
# # 获取目录权限
# directory_permission=$(ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null sftpuser@$server_ip "stat -c %a $directory_path")
# if [ "$directory_permission" -eq 777 ]; then
# echo -e "$directory_path 目录权限为777。${GREEN}正常${NC}"
# else
# warn "$directory_path 目录权限不是777。"
# fi
# else
# warn "无法连接$server_ip 服务器或获取目录权限。"
# fi
# }
# Section heading for the sftp user and directory-permission checks on the master.
echo -e "\n${BLUE}检查master的sftp目录权限:${NC}\n"

#######################################
# Check that an account exists on a remote host by running `id` over ssh.
# ssh exits with 255 when the connection itself fails, so that case is
# reported separately instead of as a missing user.
# Arguments: $1 - ssh destination (host or user@host)
#            $2 - account name to look up
#######################################
check_user() {
  local host=$1
  local account=$2
  local rc

  # BatchMode and ConnectTimeout keep the check from blocking on a password
  # prompt or an unreachable host.
  ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
    -o UserKnownHostsFile=/dev/null "$host" "id ${account} >/dev/null 2>&1"
  rc=$?

  if ((rc == 0)); then
    echo -e "${account}用户在${host}上存在。${GREEN}正常${NC}"
  elif ((rc == 255)); then
    warn "无法连接${host},未能确认${account}用户是否存在。"
  else
    err "${account}用户在${host}上不存在"
  fi
}
#######################################
# Check that a remote directory has the expected octal permission.
# Only `stat` runs on the remote host; the comparison and reporting happen
# locally, because err() is defined in this script and does not exist in the
# remote shell.
# Arguments: $1 - ssh destination
#            $2 - directory path on the remote host
#            $3 - expected permission as printed by `stat -c %a` (e.g. 755)
#######################################
check_dir_perm() {
  local host=$1
  local dir=$2
  local expected=$3
  local actual

  # printf %q quotes the path for the remote shell.
  actual=$(ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
    -o UserKnownHostsFile=/dev/null "$host" "stat -c %a -- $(printf '%q' "$dir")")

  if [[ -z $actual ]]; then
    # No output: the connection failed or the directory does not exist.
    err "无法获取${host}上${dir}目录的权限"
  elif [[ $actual == "$expected" ]]; then
    echo -e "${dir}目录为${expected}权限。${GREEN}正常${NC}"
  else
    err "${dir}目录不为${expected}权限(当前为${actual})"
  fi
}
# Check the sftp account and the permissions of its directories on the master.
check_user "$server1_ip" "sftpuser"
for sftp_dir in "/opt" "/opt/ftpfile/sftp/sftpuser"; do
  check_dir_perm "$server1_ip" "$sftp_dir" 755
done
echo -e "\n------------------------------------------------------------------------------"
# Heading of the hostname consistency section (hostname vs. /etc/hosts).
echo -e "\n${BLUE}检查主机名的一致性:${NC}\n"
#######################################
# Check that a host's own hostname is listed in its /etc/hosts.
# The hostname may appear as any name (canonical name or alias) on a
# non-comment line whose first field is an IPv4 address, and it is compared as
# a literal string rather than as a regular expression.
# Arguments: $1 - host, $2 - login user
#######################################
check_hostname_and_hosts() {
  local server_ip=$1
  local server_user=$2
  local -a ssh_opts=(-q -o BatchMode=yes -o ConnectTimeout=5
    -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null)
  local server_hostname hosts_file_content

  if ! server_hostname=$(ssh "${ssh_opts[@]}" "${server_user}@${server_ip}" "hostname") \
    || [[ -z $server_hostname ]]; then
    warn "无法连接$server_ip 服务器或获取主机名。"
    return
  fi

  hosts_file_content=$(ssh "${ssh_opts[@]}" "${server_user}@${server_ip}" "cat /etc/hosts")

  if awk -v host="$server_hostname" '
    {
      sub(/#.*/, "")
      if ($1 !~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/) next
      for (i = 2; i <= NF; i++) {
        if ($i == host) { found = 1; exit }
      }
    }
    END { exit(found ? 0 : 1) }
  ' <<< "$hosts_file_content"; then
    echo -e "$server_ip 主机名与/etc/hosts文件一致。${GREEN}正常${NC}"
  else
    warn "$server_ip 主机名与/etc/hosts文件不一致。"
  fi
}
#######################################
# Check that a host's hostname is the name of one of the Kubernetes nodes
# reported by `kubectl get node` on that host.
# The previous version compared against items[$3] only, which assumes the
# node list order matches the master/worker order; any node name now counts.
# Arguments: $1 - host
#            $2 - login user
#            $3 - node index; accepted for backward compatibility, unused
#######################################
check_hostname_and_k8s_node() {
  local server_ip=$1
  local server_user=$2
  local -a ssh_opts=(-q -o BatchMode=yes -o ConnectTimeout=5
    -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null)
  local k8s_node_names server_hostname node_name
  local -a node_list

  # jsonpath with [*] prints every node name, separated by spaces.
  if ! k8s_node_names=$(ssh "${ssh_opts[@]}" "${server_user}@${server_ip}" \
    "kubectl get node -o jsonpath='{.items[*].metadata.name}'") \
    || [[ -z $k8s_node_names ]]; then
    warn "无法连接$server_ip 服务器或获取Kubernetes节点名。"
    return
  fi

  server_hostname=$(ssh "${ssh_opts[@]}" "${server_user}@${server_ip}" "hostname")

  read -r -a node_list <<< "$k8s_node_names"
  for node_name in "${node_list[@]}"; do
    if [[ $node_name == "$server_hostname" ]]; then
      echo -e "$server_ip 主机名与Kubernetes节点名一致。${GREEN}正常${NC}"
      return 0
    fi
  done
  warn "$server_ip 主机名与Kubernetes节点名不一致。"
}
# Compare each server's hostname with its /etc/hosts entries.
check_hostname_and_hosts "$server1_ip" "$server1_user"
check_hostname_and_hosts "$server2_ip" "$server2_user"
# Compare each server's hostname with its Kubernetes node name
# (third argument: node index, 0 for server1 and 1 for server2).
check_hostname_and_k8s_node "$server1_ip" "$server1_user" 0
check_hostname_and_k8s_node "$server2_ip" "$server2_user" 1
echo -e "\n------------------------------------------------------------------------------"
# 磁盘写入速度测试(带缓存)
# disk_speed_test() {
# # local server_ip=$1
# # local server_user=$2
# # local test_file="/tmp/disk_speed_test_file"
# # # 进行写入测试
# # write_speed=$(ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $server_user@$server_ip "dd if=/dev/zero of=${test_file} bs=8k count=128 conv=fsync oflag=direct" 2>&1 | tail -n 1)
# # echo -e "${server_ip} 写入速度: $write_speed"
# # # 进行读取测试
# # read_speed=$(ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $server_user@$server_ip "sudo dd if=${test_file} of=/dev/null bs=8k count=128 conv=fsync iflag=direct" 2>&1 | tail -n 1)
# # echo -e "${server_ip} 读取速度: $read_speed"
# # # 删除测试文件
# # ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $server_user@$server_ip "rm $test_file"
# }
#######################################
# Measure average disk WRITE throughput on a remote host (page cache enabled,
# flushed with conv=fsync) by running a 1 MiB dd repeatedly for a fixed time.
# Every rate reported by dd is converted to MB/s before averaging (GNU dd
# prints decimal units, so 1 GB = 1000 MB and 1 MB = 1000 kB); runs that
# produce no rate are ignored. Samples are kept in memory and the remote test
# file is removed afterwards.
# Arguments: $1 - ssh destination
#            $2 - optional test duration in seconds (default 30)
# Returns:   1 when no throughput figure could be collected
#######################################
disk_speed_test() {
  local server_ip=$1
  local test_duration=${2:-30} # test duration in seconds
  local time0 disk_info io_res average_speed
  local samples=""

  echo "开始测试 $server_ip 过程会持续${test_duration}s..."
  time0=$(date "+%s")
  while (($(date "+%s") - time0 <= test_duration)); do
    # LC_ALL=C keeps the dd summary in English with "." as decimal separator.
    disk_info=$(ssh "$server_ip" 'LC_ALL=C dd if=/dev/zero of=output.file bs=8k count=128 conv=fsync 2>&1 1>/dev/null')
    io_res=$(grep --only-matching -E '[0-9.]+ ([kMGT]?B|bytes)/(sec|s|秒)' <<< "$disk_info")
    if [[ -n $io_res ]]; then
      samples+="${io_res}"$'\n'
    fi
  done
  # Remove the scratch file that dd created in the remote login directory.
  ssh "$server_ip" 'rm -f output.file' > /dev/null 2>&1

  average_speed=$(printf '%s' "$samples" | awk '
    {
      unit = $2
      sub(/\/.*$/, "", unit)
      if (unit == "TB") mb = $1 * 1000000
      else if (unit == "GB") mb = $1 * 1000
      else if (unit == "MB") mb = $1
      else if (unit == "kB") mb = $1 / 1000
      else mb = $1 / 1000000
      total += mb
      n++
    }
    END { if (n > 0) printf("%.2f", total / n) }
  ')

  if [[ -z $average_speed ]]; then
    err "未能从 $server_ip 获取任何磁盘测速结果。"
    return 1
  fi
  echo -e "平均速度 $server_ip: ${RED}$average_speed MB/s${NC}。推荐值:${GREEN}>=200m/s${NC}"
}
# 磁盘写入速度测试(不带缓存)
# disk_speed_test_no_cache() {
# local server_ip=$1
# local server_user=$2
# local test_file="/tmp/disk_speed_test_file"
# # 禁用磁盘缓存
# ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $server_user@$server_ip "hdparm -W0 /dev/mapper/centos-root" 2>/dev/null
# # 进行写入测试
# write_speed=$(ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $server_user@$server_ip "dd if=/dev/zero of=${test_file} bs=8k count=128 conv=fsync oflag=direct" 2>&1 | tail -n 1)
# echo -e "${server_ip} 写入速度: $write_speed"
# # 进行读取测试
# read_speed=$(ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $server_user@$server_ip "sudo dd if=${test_file} of=/dev/null bs=8k count=128 conv=fsync iflag=direct" 2>&1 | tail -n 1)
# echo -e "${server_ip} 读取速度: $read_speed"
# # 删除测试文件
# ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $server_user@$server_ip "rm $test_file"
# # 启用磁盘缓存
# ssh -q -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $server_user@$server_ip "hdparm -W1 /dev/mapper/centos-root" 2>/dev/null
# }
#######################################
# Measure average disk write throughput on a remote host while bypassing the
# page cache (oflag=direct,nonblock) by running a 1 MiB dd repeatedly for a
# fixed time.
# Every rate reported by dd is converted to MB/s before averaging (GNU dd
# prints decimal units, so 1 GB = 1000 MB and 1 MB = 1000 kB); runs that
# produce no rate are ignored. Samples are kept in memory and the remote test
# file is removed afterwards.
# Arguments: $1 - ssh destination
#            $2 - optional test duration in seconds (default 30)
# Returns:   1 when no throughput figure could be collected
#######################################
disk_speed_test_no_cache() {
  local server_ip="$1"
  local test_duration=${2:-30} # test duration in seconds
  local time0 disk_info io_res average_speed
  local samples=""

  echo "开始测试 $server_ip 过程会持续${test_duration}s..."
  time0=$(date "+%s")
  while (($(date "+%s") - time0 <= test_duration)); do
    # LC_ALL=C keeps the dd summary in English with "." as decimal separator.
    disk_info=$(ssh "$server_ip" 'LC_ALL=C dd if=/dev/zero of=output.file bs=8k count=128 oflag=direct,nonblock conv=fsync 2>&1 1>/dev/null')
    io_res=$(grep --only-matching -E '[0-9.]+ ([kMGT]?B|bytes)/(sec|s|秒)' <<< "$disk_info")
    if [[ -n $io_res ]]; then
      samples+="${io_res}"$'\n'
    fi
  done
  # Remove the scratch file that dd created in the remote login directory.
  ssh "$server_ip" 'rm -f output.file' > /dev/null 2>&1

  average_speed=$(printf '%s' "$samples" | awk '
    {
      unit = $2
      sub(/\/.*$/, "", unit)
      if (unit == "TB") mb = $1 * 1000000
      else if (unit == "GB") mb = $1 * 1000
      else if (unit == "MB") mb = $1
      else if (unit == "kB") mb = $1 / 1000
      else mb = $1 / 1000000
      total += mb
      n++
    }
    END { if (n > 0) printf("%.2f", total / n) }
  ')

  if [[ -z $average_speed ]]; then
    err "未能从 $server_ip 获取任何磁盘测速结果。"
    return 1
  fi
  echo -e "平均速度 $server_ip: ${RED} $average_speed MB/s ${NC}。推荐值:${GREEN}>=50m/s${NC}"
}
# Check whether the CPU supports AVX
echo -e "\n${BLUE}检查 CPU 是否支持 AVX:${NC}\n"

#######################################
# Report whether /proc/cpuinfo on a host contains the string "avx".
# Arguments: $1 - host to query (as root over ssh)
#######################################
check_avx_support() {
  local host_ip=$1
  local -a ssh_opts=(-q -o BatchMode=yes -o ConnectTimeout=5
    -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null)
  local avx_hits

  # Any output from the remote grep means the string was found.
  avx_hits=$(ssh "${ssh_opts[@]}" "root@${host_ip}" "grep -o 'avx' /proc/cpuinfo")

  if [[ -z $avx_hits ]]; then
    warn "$host_ip CPU 不支持 AVX (Advanced Vector Extensions)。"
    return
  fi
  echo -e "$host_ip CPU 支持 AVX (Advanced Vector Extensions)。${GREEN}正常${NC}"
}
# Run the AVX check on both servers.
for avx_host in "$server1_ip" "$server2_ip"; do
  check_avx_support "$avx_host"
done
#######################################
# Load every *.tgz image archive found directly inside the image directories
# into Docker.
# Files are found with a glob instead of parsing `ls | grep .tgz`, so names
# containing spaces work and only names ending in ".tgz" are loaded. The
# working directory of the script is left unchanged.
# Arguments: optional list of directories; defaults to the product image
#            directory and its patch sub-directory
#######################################
load_images() {
  local -a image_directories=(
    "/opt/xx/images/product/insight"
    "/opt/xx/images/product/insight/patch"
  )
  local directory file

  if (($# > 0)); then
    image_directories=("$@")
  fi

  for directory in "${image_directories[@]}"; do
    if [[ ! -d $directory ]]; then
      echo "目录 $directory 不存在。"
      continue
    fi
    echo "进入目录: $directory"
    for file in "$directory"/*.tgz; do
      # An unmatched glob stays literal; skip anything that is not a file.
      [[ -f $file ]] || continue
      echo "加载镜像: ${file##*/}"
      docker load < "$file"
    done
  done
}
# Disk performance check: only runs when the operator answers exactly "y".
echo -e "\n${BLUE}磁盘测试:${NC}\n"
echo "是否要检查磁盘? (y/n):"
read confirm
case "$confirm" in
  y)
    echo -e "\n${BLUE}磁盘写入速度测试(不带缓存):${NC}\n"
    disk_speed_test_no_cache "$server1_ip"
    disk_speed_test_no_cache "$server2_ip"
    echo -e "\n${BLUE}磁盘写入速度测试(带缓存):${NC}\n"
    disk_speed_test "$server1_ip"
    disk_speed_test "$server2_ip"
    ;;
  *)
    echo "取消磁盘检查。"
    ;;
esac
# Image reload, for use when images have gone missing: only runs when the
# operator answers exactly "y".
echo -e "\n${BLUE}镜像加载:${NC}\n"
echo "是否要加载 Docker 镜像? (y/n):"
read confirm
case "$confirm" in
  y)
    load_images
    ;;
  *)
    echo "取消加载 Docker 镜像。"
    ;;
esac
```