Skip to content

Commit 8dab537

Browse files
committed
feat(metrics): add configurable metrics level filtering for Prometheus exporter
- Add configurable metrics level filtering to control which metric types are exported by the Prometheus exporter
- Implement metrics level configuration via command-line flags and YAML config
- Add support for granular control over node, process, container, VM, and pod metrics
- Add tests

Supported metric levels: node, process, container, vm, pod, all, none

Signed-off-by: Vimal Kumar <vimal78@gmail.com>
1 parent 63fc7f6 commit 8dab537

File tree

11 files changed

+969
-45
lines changed

11 files changed

+969
-45
lines changed

cmd/kepler/main.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,11 +194,15 @@ func createServices(logger *slog.Logger, cfg *config.Config) ([]service.Service,
194194
func createPrometheusExporter(logger *slog.Logger, cfg *config.Config, apiServer *server.APIServer, pm *monitor.PowerMonitor) (*prometheus.Exporter, error) {
195195
logger.Debug("Creating Prometheus exporter")
196196

197+
// Use metrics level from configuration (already parsed)
198+
metricsLevel := cfg.Exporter.Prometheus.MetricsLevel
199+
197200
collectors, err := prometheus.CreateCollectors(
198201
pm,
199202
prometheus.WithLogger(logger),
200203
prometheus.WithProcFSPath(cfg.Host.ProcFS),
201204
prometheus.WithNodeName(cfg.Kube.Node),
205+
prometheus.WithMetricsLevel(metricsLevel),
202206
)
203207
if err != nil {
204208
return nil, fmt.Errorf("failed to create Prometheus collectors: %w", err)

compose/dev/kepler-dev/etc/kepler/config.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ exporter:
3636
debugCollectors:
3737
- go
3838
- process
39+
# metricsLevel: all
40+
metricsLevel:
41+
- node
42+
- container
43+
- process
3944

4045
debug: # debug related config
4146
pprof: # pprof related config

config/config.go

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"time"
1212

1313
"github.com/alecthomas/kingpin/v2"
14+
"github.com/sustainable-computing-io/kepler/internal/exporter/prometheus/metrics"
1415
"gopkg.in/yaml.v3"
1516
"k8s.io/utils/ptr"
1617
)
@@ -53,8 +54,9 @@ type (
5354
}
5455

5556
PrometheusExporter struct {
56-
Enabled *bool `yaml:"enabled"`
57-
DebugCollectors []string `yaml:"debugCollectors"`
57+
Enabled *bool `yaml:"enabled"`
58+
DebugCollectors []string `yaml:"debugCollectors"`
59+
MetricsLevel metrics.Level `yaml:"metricsLevel"`
5860
}
5961

6062
Exporter struct {
@@ -91,6 +93,55 @@ type (
9193
}
9294
)
9395

96+
// MetricsLevelValue is a custom kingpin.Value that parses metrics levels directly into metrics.Level
97+
type MetricsLevelValue struct {
98+
level *metrics.Level
99+
}
100+
101+
// NewMetricsLevelValue creates a new MetricsLevelValue with the given target
102+
func NewMetricsLevelValue(target *metrics.Level) *MetricsLevelValue {
103+
return &MetricsLevelValue{level: target}
104+
}
105+
106+
// Set implements kingpin.Value interface - parses and accumulates metrics levels
107+
func (m *MetricsLevelValue) Set(value string) error {
108+
// Parse the single value into a level
109+
level, err := metrics.ParseLevel([]string{value})
110+
if err != nil {
111+
return err
112+
}
113+
114+
// If this is the first value and it's not a special "all" or "none" case,
115+
// initialize to none first to clear any default
116+
if *m.level == metrics.MetricsLevelAll && value != "all" {
117+
*m.level = metrics.MetricsLevelNone
118+
}
119+
120+
// Handle special cases
121+
if value == "all" {
122+
*m.level = metrics.MetricsLevelAll
123+
return nil
124+
}
125+
if value == "none" {
126+
*m.level = metrics.MetricsLevelNone
127+
return nil
128+
}
129+
130+
// Accumulate the level using bitwise OR
131+
*m.level |= level
132+
return nil
133+
}
134+
135+
// String implements kingpin.Value interface
136+
func (m *MetricsLevelValue) String() string {
137+
return m.level.String()
138+
}
139+
140+
// IsCumulative implements kingpin.Value interface to support multiple values
141+
func (m *MetricsLevelValue) IsCumulative() bool {
142+
return true
143+
}
144+
94145
type SkipValidation int
95146

96147
const (
@@ -122,6 +173,7 @@ const (
122173
ExporterPrometheusEnabledFlag = "exporter.prometheus"
123174
// NOTE: not a flag
124175
ExporterPrometheusDebugCollectors = "exporter.prometheus.debug-collectors"
176+
ExporterPrometheusMetricsFlag = "metrics"
125177

126178
// kubernetes flags
127179
KubernetesFlag = "kube.enable"
@@ -156,6 +208,7 @@ func DefaultConfig() *Config {
156208
Prometheus: PrometheusExporter{
157209
Enabled: ptr.To(true),
158210
DebugCollectors: []string{"go"},
211+
MetricsLevel: metrics.MetricsLevelAll,
159212
},
160213
},
161214
Debug: Debug{
@@ -252,6 +305,9 @@ func RegisterFlags(app *kingpin.Application) ConfigUpdaterFn {
252305

253306
prometheusExporterEnabled := app.Flag(ExporterPrometheusEnabledFlag, "Enable Prometheus exporter").Default("true").Bool()
254307

308+
var metricsLevel = metrics.MetricsLevelAll
309+
app.Flag(ExporterPrometheusMetricsFlag, "Metrics levels to export (node,process,container,vm,pod,all,none)").SetValue(NewMetricsLevelValue(&metricsLevel))
310+
255311
kubernetes := app.Flag(KubernetesFlag, "Monitor kubernetes").Default("false").Bool()
256312
kubeconfig := app.Flag(KubeConfigFlag, "Path to a kubeconfig. Only required if out-of-cluster.").ExistingFile()
257313
nodeName := app.Flag(KubeNodeNameFlag, "Name of kubernetes node on which kepler is running.").String()
@@ -295,6 +351,10 @@ func RegisterFlags(app *kingpin.Application) ConfigUpdaterFn {
295351
cfg.Exporter.Prometheus.Enabled = prometheusExporterEnabled
296352
}
297353

354+
if flagsSet[ExporterPrometheusMetricsFlag] {
355+
cfg.Exporter.Prometheus.MetricsLevel = metricsLevel
356+
}
357+
298358
if flagsSet[KubernetesFlag] {
299359
cfg.Kube.Enabled = kubernetes
300360
}
@@ -468,6 +528,7 @@ func (c *Config) manualString() string {
468528
{ExporterStdoutEnabledFlag, fmt.Sprintf("%v", c.Exporter.Stdout.Enabled)},
469529
{ExporterPrometheusEnabledFlag, fmt.Sprintf("%v", c.Exporter.Prometheus.Enabled)},
470530
{ExporterPrometheusDebugCollectors, strings.Join(c.Exporter.Prometheus.DebugCollectors, ", ")},
531+
{ExporterPrometheusMetricsFlag, c.Exporter.Prometheus.MetricsLevel.String()},
471532
{pprofEnabledFlag, fmt.Sprintf("%v", c.Debug.Pprof.Enabled)},
472533
{KubeConfigFlag, fmt.Sprintf("%v", c.Kube.Config)},
473534
}

config/config_test.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ import (
1212

1313
"github.com/alecthomas/kingpin/v2"
1414
"github.com/stretchr/testify/assert"
15+
"github.com/sustainable-computing-io/kepler/internal/exporter/prometheus/metrics"
16+
"gopkg.in/yaml.v3"
1517
"k8s.io/utils/ptr"
1618
)
1719

@@ -805,3 +807,63 @@ exporter:
805807
assert.Equal(t, exp.String(), cfg.String())
806808
})
807809
}
810+
811+
func TestMetricsLevelYAMLMarshalling(t *testing.T) {
812+
tests := []struct {
813+
name string
814+
metricsLevel metrics.Level
815+
expectedYAML string
816+
}{
817+
{
818+
name: "None",
819+
metricsLevel: metrics.MetricsLevelNone,
820+
expectedYAML: "none",
821+
},
822+
{
823+
name: "All",
824+
metricsLevel: metrics.MetricsLevelAll,
825+
expectedYAML: "all",
826+
},
827+
{
828+
name: "Node only",
829+
metricsLevel: metrics.MetricsLevelNode,
830+
expectedYAML: "node",
831+
},
832+
{
833+
name: "Pod and Node",
834+
metricsLevel: metrics.MetricsLevelPod | metrics.MetricsLevelNode,
835+
expectedYAML: "metricsLevel:\n - node\n - pod",
836+
},
837+
{
838+
name: "Node and Process",
839+
metricsLevel: metrics.MetricsLevelNode | metrics.MetricsLevelProcess,
840+
expectedYAML: "metricsLevel:\n - node\n - process",
841+
},
842+
}
843+
844+
for _, tt := range tests {
845+
t.Run(tt.name, func(t *testing.T) {
846+
cfg := DefaultConfig()
847+
cfg.Exporter.Prometheus.MetricsLevel = tt.metricsLevel
848+
849+
// Marshal the prometheus exporter section
850+
data, err := yaml.Marshal(cfg.Exporter.Prometheus)
851+
assert.NoError(t, err)
852+
853+
yamlStr := string(data)
854+
855+
// Check that the YAML contains the expected metrics level representation
856+
assert.Contains(t, yamlStr, tt.expectedYAML, "YAML should contain expected metrics level representation")
857+
858+
// Importantly, it should NOT contain the integer representation
859+
integerStr := fmt.Sprintf("metricsLevel: %d", tt.metricsLevel)
860+
assert.NotContains(t, yamlStr, integerStr, "YAML should not contain integer representation")
861+
862+
// Test round-trip: unmarshal back and verify it's the same
863+
var unmarshaled PrometheusExporter
864+
err = yaml.Unmarshal(data, &unmarshaled)
865+
assert.NoError(t, err)
866+
assert.Equal(t, tt.metricsLevel, unmarshaled.MetricsLevel)
867+
})
868+
}
869+
}

hack/gen-metric-docs/main.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515

1616
"github.com/prometheus/client_golang/prometheus"
1717
"github.com/sustainable-computing-io/kepler/internal/exporter/prometheus/collector"
18+
"github.com/sustainable-computing-io/kepler/internal/exporter/prometheus/metrics"
1819
"github.com/sustainable-computing-io/kepler/internal/monitor"
1920
)
2021

@@ -260,7 +261,7 @@ func main() {
260261
fmt.Println("Creating collectors...")
261262
// Create a logger for the collectors
262263
logger := slog.New(slog.NewTextHandler(os.Stdout, nil))
263-
powerCollector := collector.NewPowerCollector(mockMonitor, "test-node", logger)
264+
powerCollector := collector.NewPowerCollector(mockMonitor, "test-node", logger, metrics.MetricsLevelAll)
264265
fmt.Println("Created power collector")
265266
buildInfoCollector := collector.NewKeplerBuildInfoCollector()
266267
fmt.Println("Created build info collector")

internal/exporter/prometheus/collector/power_collector.go

Lines changed: 56 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"time"
1111

1212
"github.com/prometheus/client_golang/prometheus"
13+
"github.com/sustainable-computing-io/kepler/internal/exporter/prometheus/metrics"
1314
"github.com/sustainable-computing-io/kepler/internal/monitor"
1415
)
1516

@@ -20,8 +21,9 @@ type PowerDataProvider = monitor.PowerDataProvider
2021
// PowerCollector combines Node, Process, and Container collectors to ensure data consistency
2122
// by fetching all data in a single atomic operation during collection
2223
type PowerCollector struct {
23-
pm PowerDataProvider
24-
logger *slog.Logger
24+
pm PowerDataProvider
25+
logger *slog.Logger
26+
metricsLevel metrics.Level
2527

2628
// Lock to ensure thread safety during collection
2729
mutex sync.RWMutex
@@ -95,7 +97,7 @@ func timeDesc(level, device, nodeName string, labels []string) *prometheus.Desc
9597

9698
// NewPowerCollector creates a collector that provides consistent metrics
9799
// by fetching all data in a single snapshot during collection
98-
func NewPowerCollector(monitor PowerDataProvider, nodeName string, logger *slog.Logger) *PowerCollector {
100+
func NewPowerCollector(monitor PowerDataProvider, nodeName string, logger *slog.Logger, metricsLevel metrics.Level) *PowerCollector {
99101
const (
100102
// these labels should remain the same across all descriptors to ease querying
101103
zone = "zone"
@@ -105,8 +107,9 @@ func NewPowerCollector(monitor PowerDataProvider, nodeName string, logger *slog.
105107
)
106108

107109
c := &PowerCollector{
108-
pm: monitor,
109-
logger: logger.With("collector", "power"),
110+
pm: monitor,
111+
logger: logger.With("collector", "power"),
112+
metricsLevel: metricsLevel,
110113

111114
nodeCPUJoulesDescriptor: joulesDesc("node", "cpu", nodeName, []string{zone, "path"}),
112115
nodeCPUWattsDescriptor: wattsDesc("node", "cpu", nodeName, []string{zone, "path"}),
@@ -151,33 +154,43 @@ func (c *PowerCollector) waitForData() {
151154
// Describe implements the prometheus.Collector interface
152155
func (c *PowerCollector) Describe(ch chan<- *prometheus.Desc) {
153156
// node
154-
ch <- c.nodeCPUJoulesDescriptor
155-
ch <- c.nodeCPUWattsDescriptor
156-
ch <- c.nodeCPUUsageRatioDescriptor
157-
// node cpu active
158-
ch <- c.nodeCPUActiveJoulesDesc
159-
ch <- c.nodeCPUActiveWattsDesc
160-
// node cpu idle
161-
ch <- c.nodeCPUIdleJoulesDesc
162-
ch <- c.nodeCPUIdleWattsDesc
157+
if c.metricsLevel.IsNodeEnabled() {
158+
ch <- c.nodeCPUJoulesDescriptor
159+
ch <- c.nodeCPUWattsDescriptor
160+
ch <- c.nodeCPUUsageRatioDescriptor
161+
// node cpu active
162+
ch <- c.nodeCPUActiveJoulesDesc
163+
ch <- c.nodeCPUActiveWattsDesc
164+
// node cpu idle
165+
ch <- c.nodeCPUIdleJoulesDesc
166+
ch <- c.nodeCPUIdleWattsDesc
167+
}
163168

164169
// process
165-
ch <- c.processCPUJoulesDescriptor
166-
ch <- c.processCPUWattsDescriptor
167-
ch <- c.processCPUTimeDescriptor
170+
if c.metricsLevel.IsProcessEnabled() {
171+
ch <- c.processCPUJoulesDescriptor
172+
ch <- c.processCPUWattsDescriptor
173+
ch <- c.processCPUTimeDescriptor
174+
}
168175

169176
// container
170-
ch <- c.containerCPUJoulesDescriptor
171-
ch <- c.containerCPUWattsDescriptor
172-
// ch <- c.containerCPUTimeDescriptor // TODO: add containerCPUTimeDescriptor
177+
if c.metricsLevel.IsContainerEnabled() {
178+
ch <- c.containerCPUJoulesDescriptor
179+
ch <- c.containerCPUWattsDescriptor
180+
// ch <- c.containerCPUTimeDescriptor // TODO: add containerCPUTimeDescriptor
181+
}
173182

174183
// vm
175-
ch <- c.vmCPUJoulesDescriptor
176-
ch <- c.vmCPUWattsDescriptor
184+
if c.metricsLevel.IsVMEnabled() {
185+
ch <- c.vmCPUJoulesDescriptor
186+
ch <- c.vmCPUWattsDescriptor
187+
}
177188

178189
// pod
179-
ch <- c.podCPUJoulesDescriptor
180-
ch <- c.podCPUWattsDescriptor
190+
if c.metricsLevel.IsPodEnabled() {
191+
ch <- c.podCPUJoulesDescriptor
192+
ch <- c.podCPUWattsDescriptor
193+
}
181194
}
182195

183196
func (c *PowerCollector) isReady() bool {
@@ -205,13 +218,26 @@ func (c *PowerCollector) Collect(ch chan<- prometheus.Metric) {
205218
return
206219
}
207220

208-
c.collectNodeMetrics(ch, snapshot.Node)
209-
c.collectProcessMetrics(ch, "running", snapshot.Processes)
210-
c.collectProcessMetrics(ch, "terminated", snapshot.TerminatedProcesses)
221+
if c.metricsLevel.IsNodeEnabled() {
222+
c.collectNodeMetrics(ch, snapshot.Node)
223+
}
211224

212-
c.collectContainerMetrics(ch, snapshot.Containers)
213-
c.collectVMMetrics(ch, snapshot.VirtualMachines)
214-
c.collectPodMetrics(ch, snapshot.Pods)
225+
if c.metricsLevel.IsProcessEnabled() {
226+
c.collectProcessMetrics(ch, "running", snapshot.Processes)
227+
c.collectProcessMetrics(ch, "terminated", snapshot.TerminatedProcesses)
228+
}
229+
230+
if c.metricsLevel.IsContainerEnabled() {
231+
c.collectContainerMetrics(ch, snapshot.Containers)
232+
}
233+
234+
if c.metricsLevel.IsVMEnabled() {
235+
c.collectVMMetrics(ch, snapshot.VirtualMachines)
236+
}
237+
238+
if c.metricsLevel.IsPodEnabled() {
239+
c.collectPodMetrics(ch, snapshot.Pods)
240+
}
215241
}
216242

217243
// collectNodeMetrics collects node-level power metrics

0 commit comments

Comments (0)