1. 问题起源
接手一个评论的项目,是用golang写的,采用了gin的web框架
发现里面并没有做热重启,尝试去了解了一下文档
推荐了一个endless的平滑重启库
当我尝试里面写的示例的时候,会报错
文档里面提及的操作如下:
Compile the example
$ go build -o simple_server examples/simple.go
Run it
$ ./simple_server
2015/03/22 20:03:29 PID: 2710 localhost:4242
Make a request
$ curl http://localhost:4242/hello
WORLD!
Change the handler - eg. replace the ! with a ?
$ go build -o simple_server examples/simple.go
$ kill -1 2710
The server log says something like:
2015/03/22 20:04:10 2710 Received SIGHUP. forking.
2015/03/22 20:04:10 2710 Received SIGTERM.
2015/03/22 20:04:10 2710 Waiting for connections to finish...
2015/03/22 20:04:10 PID: 2726 localhost:4242
2015/03/22 20:04:10 accept tcp 127.0.0.1:4242: use of closed network connection
2015/03/22 20:04:10 Server on 4242 stopped
Make another request
$ curl http://localhost:4242/hello
WORLD?
实际操作如下:
[root@localhost examples]# go build -o simple simple.go
[root@localhost examples]# ./simple &
[1] 2638
[root@localhost examples]# 2018/04/09 01:50:10 2638 localhost:4242
[root@localhost examples]# go build -o simple simple.go
go install command-line-arguments: open simple: text file busy
2. 跳坑
先提前说一件事,就是我上面的代码是跑在本地的vbox虚拟机上的,系统是CentOS 7
而代码路径是共享的,也就是说我 宿主机windows 10 和 虚拟机CentOS 7 都能访问到
这里是一个坑,先跳过去不说,先解释一下普遍遇到text file busy问题的情况
3. 分析
目的主要是想弄清楚:修改正在运行的二进制文件,会对运行中的程序造成什么影响
不免俗套跟网上一样,采用cp、mv、rm几个命令来看
示例代码如下
// a.c
#include <stdio.h>
#include <unistd.h>
/* Emit a single greeting line, then sleep for a very long time so the
 * process stays alive while the experiments are carried out. */
int main() {
    puts("hello world2");   /* puts appends the trailing newline itself */
    sleep(10000000);
    return 0;
}
4. cp
先尝试用cp覆盖
[root@localhost ~]# gcc a.c -o a // 编译a.c
[root@localhost ~]# ./a & //后台执行程序 a
[2] 3903 // 回显
[root@localhost ~]# hello world2 //程序a的输出
[root@localhost ~]# mkdir ./tmp //创建./tmp目录
[root@localhost ~]# cd ./tmp/ //进入./tmp目录
[root@localhost tmp]# gcc ../a.c -o a //重新编译生成一个可执行文件a,在./tmp目录
// 情况一 cp 不加 -f
[root@localhost tmp]# cp a ../ //把./tmp/a的可执行文件覆盖到上一层
//注意:上一层的可执行文件已经运行了
cp: overwrite ‘../a’? y
cp: cannot create regular file ‘../a’: Text file busy //报错
// 情况二 cp 加 -f
[root@localhost tmp]# cp -f a ../
cp: overwrite ‘../a’? y
[root@localhost tmp]# //无回显成功
[root@localhost tmp]# strace cp a ../ 2>&1 | grep -C 3 'busy' //观察一下
stat("../a", {st_mode=S_IFREG|0755, st_size=8560, ...}) = 0
open("a", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0755, st_size=8560, ...}) = 0
open("../a", O_WRONLY|O_TRUNC) = -1 ETXTBSY (Text file busy)
open("/usr/share/locale/locale.alias", O_RDONLY|O_CLOEXEC) = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=2502, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f8b718dc000
[root@localhost tmp]# strace cp -f a ../ 2>&1
...
open("a", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0755, st_size=8560, ...}) = 0
open("../a", O_WRONLY|O_TRUNC) = -1 ETXTBSY (Text file busy)
unlink("../a") = 0
open("../a", O_WRONLY|O_CREAT|O_EXCL, 0755) = 4
fstat(4, {st_mode=S_IFREG|0755, st_size=0, ...}) = 0
fadvise64(3, 0, 0, POSIX_FADV_SEQUENTIAL) = 0
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\2\0>\0\1\0\0\0\220\4@\0\0\0\0
...
cp 有 -f 参数的时候,open ../a 同样会报 Text file busy 失败,但接着会去 unlink("../a"),再以 O_CREAT|O_EXCL 重新创建一个新文件 ../a 并写入内容,所以覆盖能成功
5.inode
An inode stores all the information about a regular file, directory, or other file system
object, except its data and name.
说清楚下面内容之前,得先说清楚inode
文件存储在硬盘上,硬盘的最小存储单位叫做“扇区”(Sector),每个扇区存储512字节(相当于0.5KB)
多个扇区组成“块”(block),块是文件存取的最小单位,最常见的大小是4KB,即连续8个扇区组成一个块
inode 包含文件的元信息,具体如下
1.文件的字节数
2.文件拥有者的User ID
3.文件的Group ID
4.文件的读、写、执行权限
5.文件的时间戳,共有三个:ctime指inode上一次变动的时间;mtime指文件内容上一次变动的时间;atime指文件上一次被访问(读取)的时间
6.链接数,即有多少文件名指向这个inode
7.文件数据block的位置
stat 命令可以查看某个文件的inode信息
[root@localhost tmp]# stat a
File: ‘a’
Size: 8560 Blocks: 24 IO Block: 4096 regular file
Device: fd00h/64768d Inode: 13662766 Links: 1
Access: (0755/-rwxr-xr-x) Uid: ( 0/ root) Gid: ( 0/ root)
Context: unconfined_u:object_r:admin_home_t:s0
Access: 2018-04-09 11:00:57.379173943 -0400
Modify: 2018-04-09 10:50:37.684332780 -0400
Change: 2018-04-09 10:50:37.684332780 -0400
Birth: -
[root@localhost tmp]# ln a a.ww
[root@localhost tmp]# stat a
File: ‘a’
Size: 8560 Blocks: 24 IO Block: 4096 regular file
Device: fd00h/64768d Inode: 13662766 Links: 2
Access: (0755/-rwxr-xr-x) Uid: ( 0/ root) Gid: ( 0/ root)
Context: unconfined_u:object_r:admin_home_t:s0
Access: 2018-04-09 11:00:57.379173943 -0400
Modify: 2018-04-09 10:50:37.684332780 -0400
Change: 2018-04-09 12:38:19.527611751 -0400
6.它山之石
找到一篇写得非常详细的知乎专栏文章,我这里重复一下里面的实验
事实上某些操作的结果与文章所写不一样,可以注意下坑点
里面很多知识点,其实在《程序员的自我修养》里面都有深入讲解
一.
删除正在被读写的文件
// 1.c
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
/* Size of the read buffer, including the byte reserved for the NUL terminator. */
#define BUFFER_SIZE 1024

/*
 * Repeatedly print the contents of data.txt through a single descriptor
 * that is opened once and kept open for the life of the process.
 *
 * Each iteration reads up to BUFFER_SIZE-1 bytes, prints them together
 * with the iteration counter, the PID and the descriptor number, sleeps
 * for one second and then seeks back to offset 0.
 *
 * The loop never ends on its own. The process exits with status 1 when
 * open(), read() or lseek() fails.
 */
int main(void) {
    int fd;
    int i = 0;
    char buffer[BUFFER_SIZE];

    if ((fd = open("data.txt", O_RDONLY)) == -1) {
        printf("Open file Error\n");
        exit(1);
    }

    pid_t pid = getpid();
    ssize_t n;   /* read() returns ssize_t, not int */

    while (1) {
        ++i;
        /* Leave one byte free for the terminator written below. */
        n = read(fd, buffer, BUFFER_SIZE - 1);
        if (n == -1) {
            printf("read Error\n");
            close(fd);
            exit(1);
        }
        buffer[n] = '\0';
        printf("%d pid:%d, fd:%d, content: %s\n", i, (int)pid, fd, buffer);
        /* Flush so each line shows up immediately even when stdout is a
         * pipe or a file and therefore fully buffered. */
        fflush(stdout);
        sleep(1);
        /* Rewind so the next read starts from the beginning again. */
        if (lseek(fd, 0L, SEEK_SET) == (off_t)-1) {
            printf("lseek Error\n");
            close(fd);
            exit(1);
        }
    }
    /* Not reached: the loop above only leaves via exit(). */
}
// data.txt
[root@localhost ~]# cat data.txt
hello world
执行
[root@localhost ~]# gcc 1.c -o 1
[root@localhost ~]# ./1
1 pid:4525, fd:3, content: hello world
2 pid:4525, fd:3, content: hello world
3 pid:4525, fd:3, content: hello world
4 pid:4525, fd:3, content: hello world
5 pid:4525, fd:3, content: hello world
6 pid:4525, fd:3, content: hello world
7 pid:4525, fd:3, content: hello world
...
上面的程序1运行的时候,尝试查看进程中打开文件的inode、data.txt的inode,以及删除data.txt之后的inode情况
[root@localhost ~]# ll /proc/4525/fd/3
lr-x------. 1 root root 64 Apr 9 13:11 /proc/4525/fd/3 -> /root/data.txt
[root@localhost ~]# stat /proc/4525/fd/3
File: ‘/proc/4525/fd/3’ -> ‘/root/data.txt’
Size: 64 Blocks: 0 IO Block: 1024 symbolic link
Device: 3h/3d Inode: 37044 Links: 1
Access: (0500/lr-x------) Uid: ( 0/ root) Gid: ( 0/ root)
Context: unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023
Access: 2018-04-09 13:11:04.923771194 -0400
Modify: 2018-04-09 13:11:04.923771194 -0400
Change: 2018-04-09 13:11:04.923771194 -0400
Birth: -
[root@localhost ~]# stat /root/data.txt
File: ‘/root/data.txt’
Size: 12 Blocks: 8 IO Block: 4096 regular file
Device: fd00h/64768d Inode: 9543125 Links: 1
Access: (0644/-rw-r--r--) Uid: ( 0/ root) Gid: ( 0/ root)
Context: unconfined_u:object_r:admin_home_t:s0
Access: 2018-04-09 13:10:24.335138927 -0400
Modify: 2018-04-09 13:10:22.605154600 -0400
Change: 2018-04-09 13:10:22.605154600 -0400
Birth: -
// 删除过后
[root@localhost ~]# rm -rf /root/data.txt
[root@localhost ~]# ll /proc/4525/fd/3
lr-x------. 1 root root 64 Apr 9 13:11 /proc/4525/fd/3 -> /root/data.txt (deleted)
[root@localhost ~]# stat /proc/4525/fd/3
File: ‘/proc/4525/fd/3’ -> ‘/root/data.txt (deleted)’
Size: 64 Blocks: 0 IO Block: 1024 symbolic link
Device: 3h/3d Inode: 37044 Links: 1
Access: (0500/lr-x------) Uid: ( 0/ root) Gid: ( 0/ root)
Context: unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023
Access: 2018-04-09 13:11:16.676664713 -0400
Modify: 2018-04-09 13:11:04.923771194 -0400
Change: 2018-04-09 13:11:04.923771194 -0400
Birth: -
[root@localhost ~]# stat /root/data.txt
stat: cannot stat ‘/root/data.txt’: No such file or directory
上述程序,删除data.txt之前
程序中所打开的文件描述符/proc/pid/fd/index 其实是 文件本身/root/data.txt 的软链接
正因为是软链接(symbolic link),所以两者的inode是不一样的
/proc/4525/fd/3 的 inode 是 37044
/root/data.txt 的 inode 是 9543125
这里看上去与文章中不一致,原因在于参数:需要加上-L,这样才能看到软链接所指向文件的inode
-L, --dereference when showing file information for a symbolic
link, show information for the file the link
references rather than for the link itself
有了以上基础,再来使用两个命令看一下这个软链接
[root@localhost ~]# ls -i1L /proc/4525/fd/3
9543125 /proc/4525/fd/3
[root@localhost ~]# stat -L /proc/4525/fd/3
File: ‘/proc/4525/fd/3’
Size: 12 Blocks: 8 IO Block: 4096 regular file
Device: fd00h/64768d Inode: 9543125 Links: 0
Access: (0644/-rw-r--r--) Uid: ( 0/ root) Gid: ( 0/ root)
Context: unconfined_u:object_r:admin_home_t:s0
Access: 2018-04-09 13:11:35.405495030 -0400
Modify: 2018-04-09 13:10:22.605154600 -0400
Change: 2018-04-09 13:11:34.520503048 -0400
Birth: -
当发生删除文件时,程序并没有崩溃,但是已经能看到软链接的另一端被标记为已删除(deleted)
实际上系统在删除之后仍然保留了文件内容,直到所有进程都关闭了这一文件
(就算在删除过后手动新建了一个同名文件,那都是新的inode,跟正在运行程序的打开文件没有任何关系)
二. 删除、覆盖正在运行的程序,跟上面是一样的
三. 删除动态链接库,是一样的;但是覆盖动态链接库的操作是允许的,这种情况下正在使用它的程序会出现异常