我向github提的第一个issue

缘起

平常经常看到github上有人提issue,今天我也提了一个。是一个kafka镜像起来之后存在僵尸进程的问题。https://github.com/wurstmeister/kafka-docker/issues/497

解决方案

作者说需要引入一个dumb-init方案来解决这个场景,我说为啥不用<https://github.com/phusion/baseimage-docker/blob/rel-0.9.16/image/bin/my_init >这个解决方案,作者说The Phusion solution requires Python - which seems like a lot of extra baggage to pull in (100MBs vs < 1Mb)

dumb-init的解决方案

https://github.com/Yelp/dumb-init#why-you-need-an-init-system

一篇很好的文章

https://blog.phusion.nl/2015/01/20/docker-and-the-pid-1-zombie-reaping-problem/

phusion的解决方案

https://github.com/phusion/baseimage-docker/blob/rel-0.9.16/image/bin/my_init
这个是用python来写的,脚本源码在这,我想弄明白他的原理,为什么他能避免这个问题呢?

其实最主要的就是一个正确的init进程。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341

#!/usr/bin/python3 -u (-u是干嘛的啊,还不知道)
import os, os.path, sys, stat, signal, errno, argparse, time, json, re

KILL_PROCESS_TIMEOUT = 5
KILL_ALL_PROCESSES_TIMEOUT = 5

LOG_LEVEL_ERROR = 1
LOG_LEVEL_WARN = 1
LOG_LEVEL_INFO = 2
LOG_LEVEL_DEBUG = 3

log_level = None

terminated_child_processes = {}

class AlarmException(Exception):
pass

def error(message):
if log_level >= LOG_LEVEL_ERROR:
sys.stderr.write("*** %s\n" % message)

def warn(message):
if log_level >= LOG_LEVEL_WARN:
sys.stderr.write("*** %s\n" % message)

def info(message):
if log_level >= LOG_LEVEL_INFO:
sys.stderr.write("*** %s\n" % message)

def debug(message):
if log_level >= LOG_LEVEL_DEBUG:
sys.stderr.write("*** %s\n" % message)

def ignore_signals_and_raise_keyboard_interrupt(signame):
signal.signal(signal.SIGTERM, signal.SIG_IGN)
signal.signal(signal.SIGINT, signal.SIG_IGN)
raise KeyboardInterrupt(signame)

def raise_alarm_exception():
raise AlarmException('Alarm')

def listdir(path):
try:
result = os.stat(path)
except OSError:
return []
if stat.S_ISDIR(result.st_mode):
return sorted(os.listdir(path))
else:
return []

def is_exe(path):
try:
return os.path.isfile(path) and os.access(path, os.X_OK)
except OSError:
return False

def import_envvars(clear_existing_environment = True, override_existing_environment = True):
new_env = {}
for envfile in listdir("/etc/container_environment"):
name = os.path.basename(envfile)
with open("/etc/container_environment/" + envfile, "r") as f:
# Text files often end with a trailing newline, which we
# don't want to include in the env variable value. See
# https://github.com/phusion/baseimage-docker/pull/49
value = re.sub('\n\Z', '', f.read())
new_env[name] = value
if clear_existing_environment:
os.environ.clear()
for name, value in new_env.items():
if override_existing_environment or not name in os.environ:
os.environ[name] = value

def export_envvars(to_dir = True):
shell_dump = ""
for name, value in os.environ.items():
if name in ['HOME', 'USER', 'GROUP', 'UID', 'GID', 'SHELL']:
continue
if to_dir:
with open("/etc/container_environment/" + name, "w") as f:
f.write(value)
shell_dump += "export " + shquote(name) + "=" + shquote(value) + "\n"
with open("/etc/container_environment.sh", "w") as f:
f.write(shell_dump)
with open("/etc/container_environment.json", "w") as f:
f.write(json.dumps(dict(os.environ)))

_find_unsafe = re.compile(r'[^\w@%+=:,./-]').search

def shquote(s):
"""Return a shell-escaped version of the string *s*."""
if not s:
return "''"
if _find_unsafe(s) is None:
return s

# use single quotes, and put single quotes into double quotes
# the string $'b is then quoted as '$'"'"'b'
return "'" + s.replace("'", "'\"'\"'") + "'"

# Waits for the child process with the given PID, while at the same time
# reaping any other child processes that have exited (e.g. adopted child
# processes that have terminated).(主要逻辑在这)
def waitpid_reap_other_children(pid):
global terminated_child_processes

status = terminated_child_processes.get(pid)
if status:
# A previous call to waitpid_reap_other_children(),
# with an argument not equal to the current argument,
# already waited for this process. Return the status
# that was obtained back then.
del terminated_child_processes[pid]
return status

done = False
status = None
while not done:
try:
this_pid, status = os.waitpid(-1, 0)
if this_pid == pid:
done = True
else:
# Save status for later.
terminated_child_processes[this_pid] = status
except OSError as e:
if e.errno == errno.ECHILD or e.errno == errno.ESRCH:
return None
else:
raise
return status

def stop_child_process(name, pid, signo = signal.SIGTERM, time_limit = KILL_PROCESS_TIMEOUT):
info("Shutting down %s (PID %d)..." % (name, pid))
try:
os.kill(pid, signo)
except OSError:
pass
signal.alarm(time_limit)
try:
try:
waitpid_reap_other_children(pid)
except OSError:
pass
except AlarmException:
warn("%s (PID %d) did not shut down in time. Forcing it to exit." % (name, pid))
try:
os.kill(pid, signal.SIGKILL)
except OSError:
pass
try:
waitpid_reap_other_children(pid)
except OSError:
pass
finally:
signal.alarm(0)

def run_command_killable(*argv):
filename = argv[0]
status = None
pid = os.spawnvp(os.P_NOWAIT, filename, argv)
try:
status = waitpid_reap_other_children(pid)
except BaseException as s:
warn("An error occurred. Aborting.")
stop_child_process(filename, pid)
raise
if status != 0:
if status is None:
error("%s exited with unknown status\n" % filename)
else:
error("%s failed with status %d\n" % (filename, os.WEXITSTATUS(status)))
sys.exit(1)

def run_command_killable_and_import_envvars(*argv):
run_command_killable(*argv)
import_envvars()
export_envvars(False)

def kill_all_processes(time_limit):
info("Killing all processes...")
try:
os.kill(-1, signal.SIGTERM)
except OSError:
pass
signal.alarm(time_limit)
try:
# Wait until no more child processes exist.
done = False
while not done:
try:
os.waitpid(-1, 0)
except OSError as e:
if e.errno == errno.ECHILD:
done = True
else:
raise
except AlarmException:
warn("Not all processes have exited in time. Forcing them to exit.")
try:
os.kill(-1, signal.SIGKILL)
except OSError:
pass
finally:
signal.alarm(0)

def run_startup_files():
# Run /etc/my_init.d/*
for name in listdir("/etc/my_init.d"):
filename = "/etc/my_init.d/" + name
if is_exe(filename):
info("Running %s..." % filename)
run_command_killable_and_import_envvars(filename)

# Run /etc/rc.local.
if is_exe("/etc/rc.local"):
info("Running /etc/rc.local...")
run_command_killable_and_import_envvars("/etc/rc.local")

def start_runit():
info("Booting runit daemon...")
pid = os.spawnl(os.P_NOWAIT, "/usr/bin/runsvdir", "/usr/bin/runsvdir",
"-P", "/etc/service")
info("Runit started as PID %d" % pid)
return pid

def wait_for_runit_or_interrupt(pid):
try:
status = waitpid_reap_other_children(pid)
return (True, status)
except KeyboardInterrupt:
return (False, None)

def shutdown_runit_services():
debug("Begin shutting down runit services...")
os.system("/usr/bin/sv down /etc/service/*")

def wait_for_runit_services():
debug("Waiting for runit services to exit...")
done = False
while not done:
done = os.system("/usr/bin/sv status /etc/service/* | grep -q '^run:'") != 0
if not done:
time.sleep(0.1)

def install_insecure_key():
info("Installing insecure SSH key for user root")
run_command_killable("/usr/sbin/enable_insecure_key")

def main(args):
import_envvars(False, False)
export_envvars()

if args.enable_insecure_key:
install_insecure_key()

if not args.skip_startup_files:
run_startup_files()

runit_exited = False
exit_code = None

if not args.skip_runit:
runit_pid = start_runit()
try:
exit_status = None
if len(args.main_command) == 0:
runit_exited, exit_code = wait_for_runit_or_interrupt(runit_pid)
if runit_exited:
if exit_code is None:
info("Runit exited with unknown status")
exit_status = 1
else:
exit_status = os.WEXITSTATUS(exit_code)
info("Runit exited with status %d" % exit_status)
else:
info("Running %s..." % " ".join(args.main_command))
pid = os.spawnvp(os.P_NOWAIT, args.main_command[0], args.main_command)
try:
exit_code = waitpid_reap_other_children(pid)
if exit_code is None:
info("%s exited with unknown status." % args.main_command[0])
exit_status = 1
else:
exit_status = os.WEXITSTATUS(exit_code)
info("%s exited with status %d." % (args.main_command[0], exit_status))
except KeyboardInterrupt:
stop_child_process(args.main_command[0], pid)
raise
except BaseException as s:
warn("An error occurred. Aborting.")
stop_child_process(args.main_command[0], pid)
raise
sys.exit(exit_status)
finally:
if not args.skip_runit:
shutdown_runit_services()
if not runit_exited:
stop_child_process("runit daemon", runit_pid)
wait_for_runit_services()

# Parse options.
parser = argparse.ArgumentParser(description = 'Initialize the system.')
parser.add_argument('main_command', metavar = 'MAIN_COMMAND', type = str, nargs = '*',
help = 'The main command to run. (default: runit)')
parser.add_argument('--enable-insecure-key', dest = 'enable_insecure_key',
action = 'store_const', const = True, default = False,
help = 'Install the insecure SSH key')
parser.add_argument('--skip-startup-files', dest = 'skip_startup_files',
action = 'store_const', const = True, default = False,
help = 'Skip running /etc/my_init.d/* and /etc/rc.local')
parser.add_argument('--skip-runit', dest = 'skip_runit',
action = 'store_const', const = True, default = False,
help = 'Do not run runit services')
parser.add_argument('--no-kill-all-on-exit', dest = 'kill_all_on_exit',
action = 'store_const', const = False, default = True,
help = 'Don\'t kill all processes on the system upon exiting')
parser.add_argument('--quiet', dest = 'log_level',
action = 'store_const', const = LOG_LEVEL_WARN, default = LOG_LEVEL_INFO,
help = 'Only print warnings and errors')
args = parser.parse_args()
log_level = args.log_level

if args.skip_runit and len(args.main_command) == 0:
error("When --skip-runit is given, you must also pass a main command.")
sys.exit(1)

# Run main function.
signal.signal(signal.SIGTERM, lambda signum, frame: ignore_signals_and_raise_keyboard_interrupt('SIGTERM'))
signal.signal(signal.SIGINT, lambda signum, frame: ignore_signals_and_raise_keyboard_interrupt('SIGINT'))
signal.signal(signal.SIGALRM, lambda signum, frame: raise_alarm_exception())
try:
main(args)
except KeyboardInterrupt:
warn("Init system aborted.")
exit(2)
finally:
if args.kill_all_on_exit:
kill_all_processes(KILL_ALL_PROCESSES_TIMEOUT)
-------------本文结束感谢您的阅读-------------