@@ -137,7 +137,19 @@ def perform(server_id)
137137
138138 Rails . logger . info "[ResourceMetrics] #{ server . name } : Found #{ volume_paths . count } unique volume paths"
139139
140- # 4. Bulk calculate all volume sizes (simple du approach)
140+ # 4. Collect zombie processes for all containers (single SSH call)
141+ Rails . logger . info "[ResourceMetrics] #{ server . name } : → Counting zombie processes per container..."
142+ zombie_start = Time . current
143+ zombies_by_container = collect_container_zombies ( client , id_to_resource . keys )
144+ zombie_elapsed = ( Time . current - zombie_start ) . round ( 2 )
145+ total_zombies = zombies_by_container . values . sum
146+ if total_zombies > 0
147+ Rails . logger . info "[ResourceMetrics] #{ server . name } : Found #{ total_zombies } zombie processes across #{ zombies_by_container . count { |_ , v | v > 0 } } containers in #{ zombie_elapsed } s"
148+ else
149+ Rails . logger . info "[ResourceMetrics] #{ server . name } : No zombie processes found (#{ zombie_elapsed } s)"
150+ end
151+
152+ # 5. Bulk calculate all volume sizes (simple du approach)
141153 volume_sizes = { }
142154 if volume_paths . any?
143155 Rails . logger . info "[ResourceMetrics] #{ server . name } : → Calculating volume sizes (#{ volume_paths . count } paths)..."
@@ -212,6 +224,9 @@ def perform(server_id)
212224 # Use container limit if set, otherwise use system memory from stats
213225 final_mem_limit = container_mem_limit > 0 ? container_mem_limit : mem_limit
214226
227+ # Get zombie process count for this container
228+ zombie_count = zombies_by_container [ cid ] . to_i
229+
215230 # Log detailed info for first few containers
216231 if collected < 3
217232 limit_source = container_mem_limit > 0 ? "container limit" : "system memory"
@@ -224,6 +239,7 @@ def perform(server_id)
224239 Rails . logger . info "[ResourceMetrics] #{ server . name } : #{ resource . name } :"
225240 Rails . logger . info "[ResourceMetrics] #{ server . name } : MEM: #{ used_mb } MB / #{ limit_gb } GB (#{ limit_source } )"
226241 Rails . logger . info "[ResourceMetrics] #{ server . name } : DISK: RW=#{ rw_mb } MB, Volumes=#{ vol_mb } MB (#{ mounts . count } mounts)"
242+ Rails . logger . info "[ResourceMetrics] #{ server . name } : ZOMBIES: #{ zombie_count } " if zombie_count > 0
227243 end
228244
229245 ResourceStat . create! (
@@ -235,7 +251,8 @@ def perform(server_id)
235251 mem_used_bytes : mem_used ,
236252 mem_limit_bytes : final_mem_limit ,
237253 disk_runtime_bytes : size_rw ,
238- disk_persistent_bytes : container_volume_size
254+ disk_persistent_bytes : container_volume_size ,
255+ zombie_processes : zombie_count
239256 )
240257 collected += 1
241258
@@ -304,6 +321,95 @@ def parse_size_to_bytes(size_str)
304321 else value
305322 end . to_i
306323 end
324+
# Counts zombie (defunct) processes per container with a single remote command.
#
# The compound command prints three marker-delimited sections:
#   =ZOMBIES=     "<count> <ppid>" rows: zombies grouped by parent PID
#   =CONTAINERS=  "<main pid> <full container id>" rows from `docker inspect`
#   =TREE=        "<pid> <ppid>" rows for every process
# Each zombie's parent is walked up the process tree until a container's main
# PID is reached; zombies that never reach one are not attributed.
#
# @param client [#exec!] called as exec!(cmd, timeout: 30); its result is
#   destructured into three values, the second being the command output
# @param container_ids [Array<#to_s>] container IDs handed to `docker inspect`
# @return [Hash{String => Integer}] zombie count keyed by the ID the caller
#   supplied (docker prints the full ID, so a supplied short ID is matched by
#   prefix; the full ID is used when no supplied ID matches). Containers
#   without zombies are absent. Returns {} on any error (logged as a warning).
def collect_container_zombies(client, container_ids)
  return {} if container_ids.empty?

  # Blank entries would only add stray arguments and break prefix matching.
  requested_ids = container_ids.map(&:to_s).reject(&:empty?)
  return {} if requested_ids.empty?

  # Single compound command to get all data we need
  compound_cmd = <<~SHELL.strip
    echo "=ZOMBIES=";
    ps -eo ppid,state 2>/dev/null | awk '$2=="Z" {print $1}' | sort | uniq -c || echo "";
    echo "=CONTAINERS=";
    docker inspect --format '{{.State.Pid}} {{.Id}}' #{requested_ids.join(' ')} 2>/dev/null || echo "";
    echo "=TREE=";
    ps -eo pid,ppid 2>/dev/null || echo ""
  SHELL

  _code, output, _err = client.exec!(compound_cmd, timeout: 30)

  # A nil output is treated as "no data" instead of raising.
  sections = output.to_s.split(/^=/)

  # Returns the two-column rows of the named section as arrays of strings;
  # blank lines and rows with any other column count are ignored.
  rows_of = lambda do |name|
    marker = "#{name}="
    body = sections.find { |section| section.start_with?(marker) }
    body.to_s.sub(marker, '').each_line.map(&:split).select { |fields| fields.length == 2 }
  end

  # `uniq -c` rows: "  5 1234" -> {1234 => 5}
  zombie_parents = rows_of.call('ZOMBIES').each_with_object({}) do |(count, ppid), acc|
    acc[ppid.to_i] = count.to_i if ppid.to_i > 0
  end

  # "12345 abc123..." -> {12345 => caller's id for that container}
  pid_to_container = rows_of.call('CONTAINERS').each_with_object({}) do |(pid, full_id), acc|
    next unless pid.to_i > 0 # a non-numeric or zero PID cannot be matched against the tree

    acc[pid.to_i] = requested_ids.find { |cid| full_id.start_with?(cid) } || full_id
  end

  # {child_pid => parent_pid}; the "PID PPID" header row parses to 0 and is skipped
  parent_of = rows_of.call('TREE').each_with_object({}) do |(pid, ppid), acc|
    acc[pid.to_i] = ppid.to_i if pid.to_i > 0
  end

  zombies_by_container = Hash.new(0)
  max_depth = 50 # upper bound on ancestors examined per zombie parent

  zombie_parents.each do |ppid, count|
    current_pid = ppid
    visited = {}

    max_depth.times do
      # Stop at PID 1 / unknown parents, and never revisit a PID (cycle guard).
      break if current_pid.nil? || current_pid <= 1 || visited[current_pid]

      visited[current_pid] = true

      if (container_id = pid_to_container[current_pid])
        zombies_by_container[container_id] += count
        break
      end

      current_pid = parent_of[current_pid]
    end
  end

  zombies_by_container
rescue StandardError => e
  Rails.logger.warn "[ResourceMetrics] Failed to collect zombie processes: #{e.message}"
  {}
end
307413end
308414
309415
0 commit comments