Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cmd/node-observer/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,5 +83,15 @@ func mainInternal(c string) error {
// Controller
g.Add(controller.Start, controller.Stop)

// Register profiling endpoints if enabled
//if cfg.EnableProfiling {
profilingPort := cfg.ProfilingPort
if profilingPort == 0 {
profilingPort = node_observer.DefaultProfilingPort
}
profilingServer := node_observer.NewProfilingServer(profilingPort)
g.Add(profilingServer.Start, profilingServer.Stop)
//}

return g.Run()
}
1 change: 1 addition & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ type Config struct {
CredsPath *string `yaml:"credentialsPath,omitempty"`
FwdSvcURL *string `yaml:"forwardServiceUrl,omitempty"`
Env map[string]string `yaml:"env"`
EnableProfiling bool `yaml:"enableProfiling,omitempty"`

// derived
Credentials map[string]any
Expand Down
4 changes: 4 additions & 0 deletions pkg/node_observer/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,16 @@ import (
"github.com/NVIDIA/topograph/pkg/topology"
)

// DefaultProfilingPort is the TCP port for the pprof profiling server when no
// explicit port is configured (a configured port of 0 is treated as unset).
const DefaultProfilingPort = 49021

// Config holds the node observer settings. Each field's tag names the YAML key
// it is read from.
type Config struct {
// GenerateTopologyURL is read from the "generateTopologyUrl" key.
// NOTE(review): presumably the endpoint called to request topology generation — confirm.
GenerateTopologyURL string `yaml:"generateTopologyUrl"`
// Trigger is read from the "trigger" key.
Trigger Trigger `yaml:"trigger"`
// Provider and Engine are read from the "provider" and "engine" keys; their
// types come from the topology package.
Provider topology.Provider `yaml:"provider"`
Engine topology.Engine `yaml:"engine"`
// Params is a free-form map read from the "params" key.
Params map[string]any `yaml:"params"`
// EnableProfiling is read from the optional "enableProfiling" key.
// NOTE(review): intended to gate the pprof server — verify the caller actually checks it.
EnableProfiling bool `yaml:"enableProfiling,omitempty"`
// ProfilingPort is read from the optional "profilingPort" key; the zero value
// means "not set", in which case DefaultProfilingPort is expected to be used.
ProfilingPort int `yaml:"profilingPort,omitempty"`
}

type Trigger struct {
Expand Down
43 changes: 43 additions & 0 deletions pkg/node_observer/profiler.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package node_observer

import (
"fmt"
"net"
"net/http"
_ "net/http/pprof"

"k8s.io/klog/v2"
)

// Profiler serves the net/http/pprof endpoints on a dedicated listener.
type Profiler struct {
// listener is the socket the pprof HTTP server accepts connections on;
// closing it is how the server is stopped.
listener net.Listener
}

// NewProfilingServer binds a TCP listener on localhost at the given port and
// returns a Profiler that will serve pprof on it.
//
// The socket is opened eagerly, here rather than in Start. If the bind fails
// the process is terminated via klog.Fatalf, so this function never returns
// a nil Profiler.
func NewProfilingServer(port int) *Profiler {
	// Bind to localhost only, so the profiling endpoints are not exposed on
	// external interfaces.
	endpoint := net.JoinHostPort("localhost", fmt.Sprint(port))

	ln, listenErr := net.Listen("tcp", endpoint)
	if listenErr != nil {
		klog.Fatalf("Failed to start profiling server on %s: %v", endpoint, listenErr)
	}

	return &Profiler{listener: ln}
}

// Start serves the pprof endpoints on the profiler's listener. It blocks for
// the lifetime of the server and returns only once serving has ended, either
// because accepting connections failed or because Stop closed the listener.
//
// The returned error wraps the underlying cause, so callers can still inspect
// it with errors.Is / errors.As (e.g. against net.ErrClosed).
func (c *Profiler) Start() error {
	addr := c.listener.Addr().String()

	// The listener is already bound by NewProfilingServer, so announce the
	// server before entering the blocking Serve call; a log statement placed
	// after it would never run on the success path.
	klog.Infof("Pprofiler server started on %s", addr)

	// A nil handler selects http.DefaultServeMux, on which the blank import of
	// net/http/pprof registers the /debug/pprof/ handlers.
	//
	// http.Serve returns a non-nil error on every exit, including the normal
	// shutdown triggered by Stop, so the error is handed to the caller with
	// context instead of being logged here as a startup failure.
	if err := http.Serve(c.listener, nil); err != nil {
		return fmt.Errorf("pprof server on %s: %w", addr, err)
	}
	return nil
}

// Stop shuts the profiling server down by closing its listener, which makes
// the blocking Serve call in Start return.
//
// err is the reason the stop was requested; it is only logged.
func (c *Profiler) Stop(err error) {
	klog.Infof("Stopping Pprofiler server: %v", err)

	// Report a failed close instead of discarding it; there is nothing more
	// to do about it during shutdown, so it is logged rather than propagated.
	if closeErr := c.listener.Close(); closeErr != nil {
		klog.Warningf("Failed to close Pprofiler listener: %v", closeErr)
	}
}
11 changes: 11 additions & 0 deletions pkg/server/http_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"io"
"net"
"net/http"
"net/http/pprof"
"time"

"github.com/prometheus/client_golang/prometheus/promhttp"
Expand Down Expand Up @@ -99,6 +100,16 @@ func initHttpServer(ctx context.Context, cfg *config.Config) *HttpServer {
mux.HandleFunc("/healthz", healthz)
mux.Handle("/metrics", promhttp.Handler())

// Register pprof handlers for performance profiling
//if cfg.EnableProfiling {
klog.Infof("Enabling pprof profiling endpoints")
mux.HandleFunc("/debug/pprof/", pprof.Index)
mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
mux.HandleFunc("/debug/pprof/trace", pprof.Trace)
//}

return &HttpServer{
ctx: ctx,
cfg: cfg,
Expand Down