Shell脚本误杀Go进程导致自身也被终止的解决方案
服务器常驻Go服务的自动升级是通过脚本覆盖服务的方式实现的。
总体逻辑如下
#!/bin/bash
# Upgrade script: removes the old server rpm, installs the new one, kills the
# running server processes and starts the server again.
# Arguments: $1 = version, $2 = md5, $3 = path of the progress log.
# Exits with status 1 as soon as any of the four steps reports a failure.
version=$1
md5=$2
log_path=$3
# Record the requested version and checksum at the top of the log.
echo "version:$version; md5:$md5" >> $log_path
## 1. delete server rpm
# 1. Remove the rpm; stdout and stderr are captured so they can be logged
# if the command fails.
rm_cmd="sudo rpm -e server-20240202.rpm"
exec_ret=$($rm_cmd 2>&1)
exec_status=$?
if [ ${exec_status} != 0 ]; then
echo "Failed to remove rpm, exec_ret: $exec_ret" >> $log_path
exit 1
fi
echo "1、remove rpm successfully." >> $log_path
sleep 5
## 2. install new rpm
install_rpm_cmd="sudo rpm -ivh server-20240228.rpm"
exec_ret=$($install_rpm_cmd 2>&1)
exec_status=$?
if [ ${exec_status} != 0 ]; then
echo "Failed to install new rpm, exec_ret: $exec_ret" >> $log_path
exit 1
fi
# NOTE(review): this line appends to update.log in the current directory,
# not to $log_path like every other step - confirm whether that is intended.
echo "2、install rpm successfully." >> update.log
sleep 5
## 3. kill old process
# Takes every `ps -ef` line that contains "server", drops the lines that
# contain "update.sh" or "grep", and sends SIGKILL to the PIDs in column 2.
# The dollar sign of the awk field is escaped so that awk, not this shell,
# expands it.
# NOTE(review): "server" is a plain substring match, so any process with
# that text anywhere in its ps line is killed as well.
command_of_kill="ps -ef | grep server| grep -v update.sh | grep -v grep | awk '{print \$2}' |xargs sudo kill -9"
exec_ret=$(/bin/bash -c "$command_of_kill" 2>&1)
exec_status=$?
if [ ${exec_status} != 0 ]; then
echo "Failed to kill all server's process, exec_ret: $exec_ret" >> $log_path
exit 1
fi
echo "3、kill server's process successfully." >> $log_path
sleep 5
## 4. start server
# Runs start.sh from /usr/local/server/ with /usr/sbin/ and /sbin/ appended
# to PATH; the output is captured for the failure message.
command_of_server_start="export PATH=\$PATH:/usr/sbin/:/sbin/; cd /usr/local/server/; sudo ./start.sh start"
exec_ret=$(/bin/bash -c "$command_of_server_start" 2>&1)
exec_status=$?
if [ ${exec_status} != 0 ]; then
echo "Failed to start server, exec_ret: $exec_ret" >> $log_path
exit 1
fi
echo "4、start server successfully." >> $log_path
sleep 5
echo "all、Update server successfully." >> $log_path
我通过Go下载了升级脚本和升级rpm,然后使用以下命令执行脚本
// Location of the downloaded upgrade script.
upgradeAbsPath := "/tmp/update.sh"
// One log file per version/checksum pair.
logPath := fmt.Sprintf("/tmp/update-%s-%s.log", version, md5)
// The trailing "&" makes the wrapper bash start the script in the background;
// the script's own stdout/stderr are discarded, progress goes to logPath.
exec_script := fmt.Sprintf("nohup %s %s %s %s > /dev/null 2>&1 &", upgradeAbsPath, version, md5, logPath)
cmd := exec.Command("/bin/bash", "-c", exec_script)
log.Infof("exec command: /bin/bash -c %s", exec_script)
// Run only waits for the wrapper bash, which returns as soon as the script
// has been put in the background.
err = cmd.Run()
if err != nil {
	return fmt.Errorf("cmd Run failed, err: %w", err)
}
log.Infof("exec log path %s", logPath)
根据我的理解,它应该能够正常升级,但从输出来看,它在杀掉旧进程后就停止了。
1、remove rpm successfully.
2、install rpm successfully.
3、kill server's process successfully.
新服务被替换但未能启动。
我通过每秒打印相关进程看到了同样的情况
========start========
2024-02-28 16:17:03
root 23413 1 0 16:16 ? 00:00:00 /usr/local/server/server
root 23437 23413 0 16:16 ? 00:00:00 /usr/local/server/plugins/plugin1 -f /usr/local/server/plugin/plugin1/plugin1.conf
root 23439 23413 0 16:16 ? 00:00:00 /usr/local/server/plugins/plugin2 -f /usr/local/server/plugin/plugin1/plugin2.conf
========end========
========start========
2024-02-28 16:17:04
root 23413 1 0 16:16 ? 00:00:00 /usr/local/server/server
root 23437 23413 0 16:16 ? 00:00:00 /usr/local/server/plugins/plugin1 -f /usr/local/server/plugin/plugin1/plugin1.conf
root 23439 23413 0 16:16 ? 00:00:00 /usr/local/server/plugins/plugin2 -f /usr/local/server/plugin/plugin1/plugin2.conf
root 26815 17337 0 16:17 pts/6 00:00:00 /bin/bash /tmp/update.sh 1.0.2 md5_fake /tmp/update.log
root 26894 26815 0 16:17 pts/6 00:00:00 /bin/bash /tmp/update.sh 1.0.2 md5_fake /tmp/update.log
root 26895 26894 0 16:17 pts/6 00:00:00 sudo rpm -ivh server-20240228.rpm
root 26896 26895 0 16:17 pts/6 00:00:00 rpm -ivh server-20240228.rpm
root 26897 26896 0 16:17 pts/6 00:00:00 rpm -ivh server-20240228.rpm
========end========
========start========
2024-02-28 16:17:05
root 23413 1 0 16:16 ? 00:00:00 /usr/local/server/server
root 23437 23413 0 16:16 ? 00:00:00 /usr/local/server/plugins/plugin1 -f /usr/local/server/plugin/plugin1/plugin1.conf
root 23439 23413 0 16:16 ? 00:00:00 /usr/local/server/plugins/plugin2 -f /usr/local/server/plugin/plugin1/plugin2.conf
root 26815 17337 0 16:17 pts/6 00:00:00 /bin/bash /tmp/update.sh 1.0.2 md5_fake /tmp/update.log
root 26894 26815 0 16:17 pts/6 00:00:00 /bin/bash /tmp/update.sh 1.0.2 md5_fake /tmp/update.log
root 26895 26894 0 16:17 pts/6 00:00:00 sudo rpm -ivh server-20240228.rpm
root 26896 26895 99 16:17 pts/6 00:00:01 rpm -ivh server-20240228.rpm
========end========
========start========
2024-02-28 16:17:06
root 23413 1 0 16:16 ? 00:00:00 /usr/local/server/server
root 23437 23413 0 16:16 ? 00:00:00 /usr/local/server/plugins/plugin1 -f /usr/local/server/plugin/plugin1/plugin1.conf
root 23439 23413 0 16:16 ? 00:00:00 /usr/local/server/plugins/plugin2 -f /usr/local/server/plugin/plugin1/plugin2.conf
root 26815 17337 0 16:17 pts/6 00:00:00 /bin/bash /tmp/update.sh 1.0.2 md5_fake /tmp/update.log
root 26894 26815 0 16:17 pts/6 00:00:00 /bin/bash /tmp/update.sh 1.0.2 md5_fake /tmp/update.log
root 26895 26894 0 16:17 pts/6 00:00:00 sudo rpm -ivh server-20240228.rpm
root 26896 26895 99 16:17 pts/6 00:00:02 rpm -ivh server-20240228.rpm
========end========
========start========
2024-02-28 16:17:07
root 23413 1 0 16:16 ? 00:00:00 /usr/local/server/server
root 23437 23413 0 16:16 ? 00:00:00 /usr/local/server/plugins/plugin1 -f /usr/local/server/plugin/plugin1/plugin1.conf
root 23439 23413 0 16:16 ? 00:00:00 /usr/local/server/plugins/plugin2 -f /usr/local/server/plugin/plugin1/plugin2.conf
root 26815 17337 0 16:17 pts/6 00:00:00 /bin/bash /tmp/update.sh 1.0.2 md5_fake /tmp/update.log
root 26894 26815 0 16:17 pts/6 00:00:00 /bin/bash /tmp/update.sh 1.0.2 md5_fake /tmp/update.log
root 26895 26894 0 16:17 pts/6 00:00:00 sudo rpm -ivh server-20240228.rpm
root 26896 26895 79 16:17 pts/6 00:00:03 rpm -ivh server-20240228.rpm
========end========
========start========
2024-02-28 16:17:08
root 23413 1 0 16:16 ? 00:00:00 /usr/local/server/server
root 23437 23413 0 16:16 ? 00:00:00 /usr/local/server/plugins/plugin1 -f /usr/local/server/plugin/plugin1/plugin1.conf
root 23439 23413 0 16:16 ? 00:00:00 /usr/local/server/plugins/plugin2 -f /usr/local/server/plugin/plugin1/plugin2.conf
root 26815 17337 0 16:17 pts/6 00:00:00 /bin/bash /tmp/update.sh 1.0.2 md5_fake /tmp/update.log
root 26894 26815 0 16:17 pts/6 00:00:00 /bin/bash /tmp/update.sh 1.0.2 md5_fake /tmp/update.log
root 26895 26894 0 16:17 pts/6 00:00:00 sudo rpm -ivh server-20240228.rpm
root 26896 26895 84 16:17 pts/6 00:00:04 rpm -ivh server-20240228.rpm
========end========
========start========
2024-02-28 16:17:09
root 23413 1 0 16:16 ? 00:00:00 /usr/local/server/server
root 23437 23413 0 16:16 ? 00:00:00 /usr/local/server/plugins/plugin1 -f /usr/local/server/plugin/plugin1/plugin1.conf
root 23439 23413 0 16:16 ? 00:00:00 /usr/local/server/plugins/plugin2 -f /usr/local/server/plugin/plugin1/plugin2.conf
root 26815 17337 0 16:17 pts/6 00:00:00 /bin/bash /tmp/update.sh 1.0.2 md5_fake /tmp/update.log
root 26894 26815 0 16:17 pts/6 00:00:00 /bin/bash /tmp/update.sh 1.0.2 md5_fake /tmp/update.log
root 26895 26894 0 16:17 pts/6 00:00:00 sudo rpm -ivh server-20240228.rpm
root 26896 26895 88 16:17 pts/6 00:00:05 rpm -ivh server-20240228.rpm
========end========
========start========
2024-02-28 16:17:10
root 23413 1 0 16:16 ? 00:00:00 /usr/local/server/server
root 23437 23413 0 16:16 ? 00:00:00 /usr/local/server/plugins/plugin1 -f /usr/local/server/plugin/plugin1/plugin1.conf
root 23439 23413 0 16:16 ? 00:00:00 /usr/local/server/plugins/plugin2 -f /usr/local/server/plugin/plugin1/plugin2.conf
root 26815 17337 0 16:17 pts/6 00:00:00 /bin/bash /tmp/update.sh 1.0.2 md5_fake /tmp/update.log
root 26894 26815 0 16:17 pts/6 00:00:00 /bin/bash /tmp/update.sh 1.0.2 md5_fake /tmp/update.log
root 26895 26894 0 16:17 pts/6 00:00:00 sudo rpm -ivh server-20240228.rpm
root 26896 26895 90 16:17 pts/6 00:00:06 rpm -ivh server-20240228.rpm
========end========
========start========
2024-02-28 16:17:11
========end========
========start========
2024-02-28 16:17:12
========end========
========start========
2024-02-28 16:17:13
========end========
根据相关进程号,按理说升级脚本进程已经与父进程服务器分离,但我不知道为什么它也被终止了。
但是当我进入服务器并手动执行以下命令时,它完成了脚本的执行,即升级成功。
nohup /tmp/update.sh 1.0.2 md5_fake /tmp/update.log > /dev/null 2>&1 &
包括进程和日志脚本都可以体现出来。
update.log
1、remove rpm successfully.
2、install rpm successfully.
3、kill server's process successfully.
4、start server successfully.
all、Update server successfully.
所以看起来脚本本身没有问题。我不知道通过Go语言执行升级脚本和手动执行之间有什么区别,导致升级脚本无法完全执行。
我实在找不到问题所在,期待您的帮助。
当shell脚本突然终止子进程(Go进程)时(使用kill命令且未指定进一步指令),操作系统可能也会终止父进程(shell脚本本身),这是由于默认的作业控制行为所致。这是为了防止孤儿进程(没有父进程的子进程)滞留并可能消耗资源。
我尝试使用以下代码将脚本分离到独立的进程组中。
// Setpgid: true requests that the child be placed in a process group of its
// own instead of inheriting the one of this program.
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
// Start launches the command and returns without waiting for it to finish.
if err := cmd.Start(); err != nil {
return err
}
但这似乎没有解决问题。它仍然在同一时间被终止。我该如何防止脚本被杀死?
由于我的服务需要兼容不同的Linux版本,其中大部分是CentOS 6或CentOS 7,少数是RedHat,可能还有其他版本。似乎有些版本不支持 systemctl 命令。我使用脚本是希望它能更具通用性。是否有适用于大多数Linux系统的、类似于systemctl的命令?期待您的回复。
这更像是一个关于Linux进程的问题,而非Go语言问题。本质上,nohup 是不够的,因为它只能防止子进程接收到 HUP 信号。你需要将脚本进程与 Go 进程完全解耦。
一种实现方法是使用 systemd-run 在一个新的控制组中启动一个临时进程,使其与父进程分离:systemd-run
我有种感觉,由于你在这里使用了 os/exec,它无法识别命令末尾的 Unix & 符号,而该符号本应使进程分离。在 Go 中,生成的进程会在 Go 二进制文件退出时立即终止。既然你实际上是在与进程分离,如果分离成功,Go 会收到返回码 0。*cmd 有一个底层的 Process 实例用于控制生成的进程。它有一个 Release() 方法,可以用来代替等待。据我所知,它会释放进程资源,这样当二进制文件退出时,该进程就不会被杀死。
// main prints the fixed greeting "hello world" to standard output.
func main() {
fmt.Println("hello world")
}
moluzhui:
是否有适用于大多数 Linux 系统的类似 systemctl 的命令?
看起来你可以使用 service 命令:
systemctl command not working in RHEL 6
标签: centos, rhel, services
提问者:Ranjan Kumar 时间:05:16AM - 29 Nov 17 UTC
再次说明——在 Linux 论坛上你可能会有更好的运气。但看起来 service 命令可能符合要求。关键点在于(这只是我在较新版本的 CentOS 和 Ubuntu 上的经验):你运行的最后一个命令需要在一个命令中同时停止并启动你的可执行文件,否则脚本会在“启动”命令执行之前就停止运行。
我不太熟悉 Linux 系统管理(也许可以尝试在 Linux 论坛上发帖?)。话虽如此:是你的 Go 程序在执行这个脚本吗?比如它应该是自我更新的吗?我曾经在 Linux 上通过使用 systemctl 成功创建了自我更新的 Go 服务。只需在旧程序仍在运行时替换新的可执行文件,然后调用 systemctl restart myservice。如果你想尝试设置一个 systemd 服务,这里有一个很好的指南:
如何使用 Systemctl 管理 Systemd 服务和单元 | DigitalOcean
Systemd 是一个初始化系统和系统管理器,已成为 Linux 发行版的新标准。在本指南中,我们将讨论 systemctl 命令…
问题在于你的Go程序执行升级脚本时,脚本杀死了所有包含"server"的进程,包括执行脚本的Go进程本身。从进程监控可以看到,升级脚本进程(26815)的父进程是17337,这个17337很可能就是你的Go服务进程。
这是你的kill命令的问题:
ps -ef | grep server| grep -v update.sh | grep -v grep | awk '{print \$2}' |xargs sudo kill -9
这个命令会杀死所有包含"server"的进程,但只排除了"update.sh"。当Go程序执行升级脚本时,Go进程本身可能包含"server"这个字符串(比如在进程路径或参数中),所以也被杀死了。
解决方案:在Go中执行升级脚本时,让脚本在独立的进程组中运行,这样kill命令就不会影响到Go进程。修改你的Go代码:
package main
import (
"fmt"
"log"
"os/exec"
"syscall"
)
// main launches the upgrade script as a detached background process and
// returns without waiting for it to finish.
//
// The script is started in a new session (and therefore a new process group)
// so that it is not tied to the session of the process that launched it.
func main() {
	upgradeAbsPath := "/tmp/update.sh"
	version := "1.0.2"
	md5 := "md5_fake"
	logPath := fmt.Sprintf("/tmp/update-%s-%s.log", version, md5)

	// Pass the arguments as an argument vector instead of formatting them into
	// a "bash -c" string: nothing is re-parsed by a shell, so values containing
	// spaces or shell metacharacters cannot change the command that runs.
	cmd := exec.Command("/bin/bash", upgradeAbsPath, version, md5, logPath)

	// Stdin, Stdout and Stderr are left nil, which makes os/exec connect them
	// to the null device; the script reports progress through logPath.

	// Setsid makes the child call setsid(2) before exec. Wrapping the command
	// in the external setsid binary as well would be redundant.
	cmd.SysProcAttr = &syscall.SysProcAttr{
		Setsid: true,
	}

	log.Printf("exec command: /bin/bash %s %s %s %s",
		upgradeAbsPath, version, md5, logPath)

	if err := cmd.Start(); err != nil {
		log.Fatalf("cmd Start failed, err: %v", err)
	}

	// Release the process handle right away; this program never waits for
	// the script.
	if err := cmd.Process.Release(); err != nil {
		log.Printf("Process Release failed, err: %v", err)
	}

	log.Printf("exec log path %s", logPath)
}
或者,修改shell脚本的kill命令,更精确地识别要杀死的进程。在update.sh中添加更严格的过滤:
#!/bin/bash
#
# update.sh - replace the installed server rpm and restart the server.
#
# Usage: update.sh <version> <md5> <log_path>
#   version   version being installed (only written to the log)
#   md5       checksum of the new package (only written to the log)
#   log_path  file that receives one progress line per step
#
# Exits with status 1 when removing the old rpm, installing the new rpm or
# starting the server fails. A failed kill is logged but does not abort the
# run, so the new server is still started.

version=$1
md5=$2
# Every step writes to the log, so refuse to run without a log path.
log_path=${3:?usage: update.sh <version> <md5> <log_path>}

# Appends its arguments as one line to the progress log.
log() {
  printf '%s\n' "$*" >> "$log_path"
}

log "version:$version; md5:$md5"

## 1. delete server rpm
# stdout and stderr are captured so they can be logged on failure.
if ! exec_ret=$(sudo rpm -e server-20240202.rpm 2>&1); then
  log "Failed to remove rpm, exec_ret: $exec_ret"
  exit 1
fi
log "1、remove rpm successfully."
sleep 5

## 2. install new rpm
if ! exec_ret=$(sudo rpm -ivh server-20240228.rpm 2>&1); then
  log "Failed to install new rpm, exec_ret: $exec_ret"
  exit 1
fi
log "2、install rpm successfully."
sleep 5

## 3. kill old processes
# Only processes whose command line contains /usr/local/server/ are selected
# (the server binary and its plugins). pgrep never reports itself, and this
# script's own PID is skipped explicitly. The script's process group is
# deliberately NOT excluded: when the server launched this script without a
# new session they share a group, and the old server still has to be killed.
pids=()
while IFS= read -r pid; do
  [[ "$pid" == "$$" ]] && continue
  pids+=("$pid")
done < <(pgrep -f -- '/usr/local/server/')

kill_failed=0
# With no matching process there is nothing to kill; kill is not invoked with
# an empty PID list.
if (( ${#pids[@]} > 0 )); then
  exec_ret=$(sudo kill -9 "${pids[@]}" 2>&1) || kill_failed=1
fi
if (( kill_failed )); then
  # Do not exit here: still try to start the new server.
  log "Failed to kill all server's process, exec_ret: $exec_ret"
else
  log "3、kill server's process successfully."
fi
sleep 5

## 4. start server
# The command substitution runs in a subshell, so the PATH change and the cd
# do not leak into the rest of the script. start.sh is only run when the cd
# succeeded.
if ! exec_ret=$(
  {
    export PATH="$PATH:/usr/sbin/:/sbin/"
    cd /usr/local/server/ && sudo ./start.sh start
  } 2>&1
); then
  log "Failed to start server, exec_ret: $exec_ret"
  exit 1
fi
log "4、start server successfully."
sleep 5

log "all、Update server successfully."
关键改进:
- 在Go中使用 setsid 创建独立进程组
- 在shell脚本中更精确地过滤要杀死的进程,只针对 /usr/local/server/ 目录下的进程
- 使用 cmd.Start() 而不是 cmd.Run(),并立即释放进程资源
这样升级脚本就不会误杀执行它的Go进程了。


