Skip to content

Commit 408aba2

Browse files
committed
Add zombie process tracking
1 parent 1d83dd0 commit 408aba2

File tree

11 files changed

+623
-7
lines changed

11 files changed

+623
-7
lines changed

app/controllers/home_controller.rb

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ def index
3939
prepare_cpu_view
4040
when 'ram'
4141
prepare_ram_view
42+
when 'zombies'
43+
prepare_zombies_view
4244
end
4345

4446
# Get all servers for filter dropdown
@@ -103,5 +105,36 @@ def prepare_ram_view
103105
.order('avg_mem_24h DESC NULLS LAST')
104106
.limit(100)
105107
end
108+
109+
# Prepares the data for the 'zombies' view.
#
# Assigns:
#   @servers_by_zombies   - servers that reported at least one zombie process
#                           in the last 24 hours, highest peak first.
#   @resources_by_zombies - same for resources, capped at 100 rows.
#
# Every returned record carries two extra selected attributes,
# max_zombies_24h and avg_zombies_24h. When @server_id is present both
# relations are narrowed to that server.
def prepare_zombies_view
  # An inner join restricted to the 24h window is sufficient: the HAVING
  # clause discards every owner without a positive recent zombie count, so
  # the extra rows a LEFT JOIN would produce could never be returned.
  # Filtering in WHERE also keeps the aggregation from reading the whole
  # stats history.
  with_recent_zombies = lambda do |scope, association, stats_table, owner_table|
    scope
      .joins(association)
      .where("#{stats_table}.captured_at >= NOW() - INTERVAL '24 hours'")
      .select("#{owner_table}.*, " \
              "MAX(#{stats_table}.zombie_processes) AS max_zombies_24h, " \
              "AVG(#{stats_table}.zombie_processes) AS avg_zombies_24h")
      .group("#{owner_table}.id")
      .having("MAX(#{stats_table}.zombie_processes) > 0")
      .order(Arel.sql('max_zombies_24h DESC NULLS LAST'))
  end

  # Servers with zombie processes
  server_query = Server.includes(:coolify_team)
  server_query = server_query.where(id: @server_id) if @server_id.present?
  @servers_by_zombies =
    with_recent_zombies.call(server_query, :server_stats, 'server_stats', 'servers')

  # Resources with the highest zombie counts
  resource_query = Resource.includes(:server, environment: :project)
  resource_query = resource_query.where(server_id: @server_id) if @server_id.present?
  @resources_by_zombies =
    with_recent_zombies.call(resource_query, :resource_stats, 'resource_stats', 'resources').limit(100)
end
106139
end
107140

app/controllers/resources_controller.rb

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@ def prepare_chart_data(stats)
3838
memory_bytes: stats.map { |s| s.mem_used_bytes },
3939
disk_persistent: stats.map { |s| s.disk_persistent_bytes },
4040
disk_runtime: stats.map { |s| s.disk_runtime_bytes },
41-
disk_total: stats.map { |s| (s.disk_persistent_bytes.to_i + s.disk_runtime_bytes.to_i) }
41+
disk_total: stats.map { |s| (s.disk_persistent_bytes.to_i + s.disk_runtime_bytes.to_i) },
42+
zombie_processes: stats.map { |s| s.zombie_processes }
4243
}
4344
end
4445

@@ -48,6 +49,7 @@ def calculate_stats_summary(stats)
4849
cpu_values = stats.map(&:cpu_pct).compact
4950
mem_values = stats.map(&:mem_used_bytes).compact
5051
disk_values = stats.map { |s| (s.disk_persistent_bytes.to_i + s.disk_runtime_bytes.to_i) }
52+
zombie_values = stats.map(&:zombie_processes).compact
5153

5254
{
5355
cpu_avg: cpu_values.any? ? (cpu_values.sum / cpu_values.size).round(2) : nil,
@@ -59,6 +61,9 @@ def calculate_stats_summary(stats)
5961
disk_avg: disk_values.any? ? (disk_values.sum / disk_values.size).round(0) : nil,
6062
disk_max: disk_values.max,
6163
disk_min: disk_values.min,
64+
zombie_avg: zombie_values.any? ? (zombie_values.sum.to_f / zombie_values.size).round(1) : nil,
65+
zombie_max: zombie_values.max,
66+
zombie_current: zombie_values.last,
6267
data_points: stats.size
6368
}
6469
end

app/controllers/servers_controller.rb

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,8 @@ def prepare_chart_data(stats)
5959
iops_write: stats.map { |s| s.iops_w&.round(2) },
6060
load1: stats.map { |s| s.load1&.round(2) },
6161
load5: stats.map { |s| s.load5&.round(2) },
62-
load15: stats.map { |s| s.load15&.round(2) }
62+
load15: stats.map { |s| s.load15&.round(2) },
63+
zombie_processes: stats.map { |s| s.zombie_processes }
6364
}
6465
end
6566

@@ -75,6 +76,7 @@ def calculate_stats_summary(stats)
7576
load1_values = stats.map(&:load1).compact
7677
load5_values = stats.map(&:load5).compact
7778
load15_values = stats.map(&:load15).compact
79+
zombie_values = stats.map(&:zombie_processes).compact
7880

7981
{
8082
cpu_avg: cpu_values.any? ? (cpu_values.sum / cpu_values.size).round(2) : nil,
@@ -94,6 +96,9 @@ def calculate_stats_summary(stats)
9496
load1_max: load1_values.max,
9597
load5_avg: load5_values.any? ? (load5_values.sum / load5_values.size).round(2) : nil,
9698
load15_avg: load15_values.any? ? (load15_values.sum / load15_values.size).round(2) : nil,
99+
zombie_avg: zombie_values.any? ? (zombie_values.sum.to_f / zombie_values.size).round(1) : nil,
100+
zombie_max: zombie_values.max,
101+
zombie_current: zombie_values.last,
97102
data_points: stats.size
98103
}
99104
end

app/jobs/resource_metrics_collector_job.rb

Lines changed: 108 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,19 @@ def perform(server_id)
137137

138138
Rails.logger.info "[ResourceMetrics] #{server.name}: Found #{volume_paths.count} unique volume paths"
139139

140-
# 4. Bulk calculate all volume sizes (simple du approach)
140+
# 4. Collect zombie processes for all containers (single SSH call)
141+
Rails.logger.info "[ResourceMetrics] #{server.name}: → Counting zombie processes per container..."
142+
zombie_start = Time.current
143+
zombies_by_container = collect_container_zombies(client, id_to_resource.keys)
144+
zombie_elapsed = (Time.current - zombie_start).round(2)
145+
total_zombies = zombies_by_container.values.sum
146+
if total_zombies > 0
147+
Rails.logger.info "[ResourceMetrics] #{server.name}: Found #{total_zombies} zombie processes across #{zombies_by_container.count { |_, v| v > 0 }} containers in #{zombie_elapsed}s"
148+
else
149+
Rails.logger.info "[ResourceMetrics] #{server.name}: No zombie processes found (#{zombie_elapsed}s)"
150+
end
151+
152+
# 5. Bulk calculate all volume sizes (simple du approach)
141153
volume_sizes = {}
142154
if volume_paths.any?
143155
Rails.logger.info "[ResourceMetrics] #{server.name}: → Calculating volume sizes (#{volume_paths.count} paths)..."
@@ -212,6 +224,9 @@ def perform(server_id)
212224
# Use container limit if set, otherwise use system memory from stats
213225
final_mem_limit = container_mem_limit > 0 ? container_mem_limit : mem_limit
214226

227+
# Get zombie process count for this container
228+
zombie_count = zombies_by_container[cid].to_i
229+
215230
# Log detailed info for first few containers
216231
if collected < 3
217232
limit_source = container_mem_limit > 0 ? "container limit" : "system memory"
@@ -224,6 +239,7 @@ def perform(server_id)
224239
Rails.logger.info "[ResourceMetrics] #{server.name}: #{resource.name}:"
225240
Rails.logger.info "[ResourceMetrics] #{server.name}: MEM: #{used_mb}MB / #{limit_gb}GB (#{limit_source})"
226241
Rails.logger.info "[ResourceMetrics] #{server.name}: DISK: RW=#{rw_mb}MB, Volumes=#{vol_mb}MB (#{mounts.count} mounts)"
242+
Rails.logger.info "[ResourceMetrics] #{server.name}: ZOMBIES: #{zombie_count}" if zombie_count > 0
227243
end
228244

229245
ResourceStat.create!(
@@ -235,7 +251,8 @@ def perform(server_id)
235251
mem_used_bytes: mem_used,
236252
mem_limit_bytes: final_mem_limit,
237253
disk_runtime_bytes: size_rw,
238-
disk_persistent_bytes: container_volume_size
254+
disk_persistent_bytes: container_volume_size,
255+
zombie_processes: zombie_count
239256
)
240257
collected += 1
241258

@@ -304,6 +321,95 @@ def parse_size_to_bytes(size_str)
304321
else value
305322
end.to_i
306323
end
324+
325+
# Counts zombie processes per container using a single SSH round trip.
#
# The remote command prints three marker-delimited sections:
#   =ZOMBIES=     "<count> <ppid>" for each parent PID that has zombie children
#   =CONTAINERS=  "<main pid> <container id>" for each inspected container
#   =TREE=        "<pid> <ppid>" for every process
# Each zombie's parent is followed up the process tree until it reaches a
# container's main PID; zombies with no container ancestor are ignored.
#
# @param client [#exec!] responds to exec!(cmd, timeout:) and returns
#   [exit_code, stdout, stderr]
# @param container_ids [Array<String>] ids handed to `docker inspect`
# @return [Hash{String => Integer}] zombie count per container. The key is
#   the entry of container_ids that docker's reported id starts with (so
#   abbreviated ids passed in can be looked up again), otherwise docker's
#   own id. Returns {} when container_ids is empty or collection fails.
def collect_container_zombies(client, container_ids)
  return {} if container_ids.empty?

  # Single compound command to get all data we need
  compound_cmd = <<~SHELL.strip
    echo "=ZOMBIES=";
    ps -eo ppid,state 2>/dev/null | awk '$2=="Z" {print $1}' | sort | uniq -c || echo "";
    echo "=CONTAINERS=";
    docker inspect --format '{{.State.Pid}} {{.Id}}' #{container_ids.join(' ')} 2>/dev/null || echo "";
    echo "=TREE=";
    ps -eo pid,ppid 2>/dev/null || echo ""
  SHELL

  _code, output, _err = client.exec!(compound_cmd, timeout: 30)

  # Split on the leading "=" of each marker line; to_s tolerates a nil stdout.
  sections = output.to_s.split(/^=/)
  section = lambda do |marker|
    sections.find { |s| s.start_with?(marker) }.to_s.sub(marker, '').strip
  end
  # All three sections are rows of exactly two columns; header lines such as
  # "PID PPID" survive this filter but are dropped below because "PID".to_i is 0.
  pairs = ->(text) { text.each_line.map(&:split).select { |cols| cols.length == 2 } }

  # " 5 1234" -> {1234 => 5}
  zombie_parents = pairs.call(section.call('ZOMBIES=')).each_with_object({}) do |(count, ppid), acc|
    acc[ppid.to_i] = count.to_i if ppid.to_i > 0
  end

  # "12345 abc123def456..." -> {12345 => requested id}
  pid_to_container = pairs.call(section.call('CONTAINERS=')).each_with_object({}) do |(pid, reported_id), acc|
    next unless pid.to_i > 0

    # NOTE(review): `.Id` is the id as docker reports it; if callers pass
    # abbreviated ids, keying by the reported id would make their lookups
    # miss, so map back to the id that was requested.
    acc[pid.to_i] = container_ids.find { |id| reported_id.start_with?(id) } || reported_id
  end

  # {child_pid => parent_pid}
  process_tree = pairs.call(section.call('TREE=')).each_with_object({}) do |(pid, ppid), acc|
    acc[pid.to_i] = ppid.to_i if pid.to_i > 0
  end

  max_depth = 50
  zombies_by_container = Hash.new(0)

  zombie_parents.each do |ppid, count|
    current_pid = ppid
    visited = {}
    depth = 0

    # Walk towards PID 1; stop on a cycle, an unknown parent, or max_depth.
    while current_pid && current_pid > 1 && depth < max_depth
      break if visited[current_pid]

      visited[current_pid] = true

      if (container_id = pid_to_container[current_pid])
        zombies_by_container[container_id] += count
        break
      end

      current_pid = process_tree[current_pid]
      depth += 1
    end
  end

  zombies_by_container
rescue => e
  # Best effort: zombie tracking must not abort the rest of the collection.
  Rails.logger.warn "[ResourceMetrics] Failed to collect zombie processes: #{e.message}"
  {}
end
307413
end
308414

309415

app/jobs/server_metrics_collector_job.rb

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ def perform(server_id)
3737

3838
Rails.logger.info "[ServerMetrics] #{server.name}: → Collecting filesystem info..."
3939
filesystems = collect_filesystems(client)
40+
41+
Rails.logger.info "[ServerMetrics] #{server.name}: → Counting zombie processes..."
42+
zombie_count = collect_zombie_processes(client)
4043

4144
ServerStat.create!(
4245
server: server,
@@ -51,7 +54,8 @@ def perform(server_id)
5154
load1: load1,
5255
load5: load5,
5356
load15: load15,
54-
filesystems: filesystems
57+
filesystems: filesystems,
58+
zombie_processes: zombie_count
5559
)
5660

5761
elapsed = (Time.current - start_time).round(1)
@@ -237,6 +241,18 @@ def collect_cpu_cores(client)
237241
code, out, _ = client.exec!("nproc 2>/dev/null || grep -c processor /proc/cpuinfo")
238242
out.to_i if code == 0 && out.to_i > 0
239243
end
244+
245+
# Counts processes in the zombie ("Z") state on the host behind +client+.
#
# Tries `ps` first and falls back to reading /proc/<pid>/stat when the ps
# probe does not yield a plain number.
#
# @param client [#exec!] responds to exec!(cmd) and returns
#   [exit_code, stdout, stderr]
# @return [Integer, nil] the zombie count, or nil when the fallback probe
#   exits non-zero or anything raises
def collect_zombie_processes(client)
  # `grep -c` prints "0" and exits non-zero when nothing matches, so the
  # `|| echo 0` may append a second line. Only the first line is the count,
  # and it must be numeric as a whole (\A..\z, not per-line ^..$) so that the
  # value validated is the value parsed.
  code, out, _err = client.exec!("ps axo state | grep -c '^Z' 2>/dev/null || echo 0")
  first_line = out.to_s.lines.first.to_s.strip
  return first_line.to_i if code == 0 && first_line.match?(/\A\d+\z/)

  # Fallback: count from /proc
  code, out, _ = client.exec!("find /proc -maxdepth 1 -type d -name '[0-9]*' -exec cat {}/stat 2>/dev/null \\; | awk '{if ($3 == \"Z\") count++} END {print count+0}'")
  code == 0 ? out.to_i : nil
rescue => e
  # Best effort: a failed zombie probe must not abort the stats collection.
  Rails.logger.warn "[ServerMetrics] Failed to count zombie processes: #{e.message}"
  nil
end
240256
end
241257

242258

0 commit comments

Comments
 (0)