Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cpu: Add a 2nd label 'package' to metric node_cpu_core_throttles_total #871

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@

**Breaking changes**

This release contains more breaking changes:
* Rename label `node` of metric `node_cpu_package_throttles_total` to `package`.
* Add 2nd label `package` to metric `node_cpu_core_throttles_total`.

* [CHANGE]
* [FEATURE]
* [ENHANCEMENT]
* [BUGFIX]
* [BUGFIX] Count core throttles per core and per package #871

## 0.16.0-rc.1 / 2018-04-04

Expand Down
112 changes: 57 additions & 55 deletions collector/cpu_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ package collector

import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"regexp"
Expand Down Expand Up @@ -74,12 +73,12 @@ func NewCPUCollector() (Collector, error) {
cpuCoreThrottle: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "core_throttles_total"),
"Number of times this cpu core has been throttled.",
[]string{"core"}, nil,
[]string{"package", "core"}, nil,
),
cpuPackageThrottle: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "package_throttles_total"),
"Number of times this cpu package has been throttled.",
[]string{"node"}, nil,
[]string{"package"}, nil,
),
}, nil
}
Expand All @@ -95,16 +94,16 @@ func (c *cpuCollector) Update(ch chan<- prometheus.Metric) error {
return nil
}

// updateCPUfreq reads /sys/bus/cpu/devices/cpu* and expose cpu frequency statistics.
// updateCPUfreq reads /sys/devices/system/cpu/cpu* and exposes cpu frequency statistics.
func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error {
cpus, err := filepath.Glob(sysFilePath("bus/cpu/devices/cpu[0-9]*"))
cpus, err := filepath.Glob(sysFilePath("devices/system/cpu/cpu[0-9]*"))
if err != nil {
return err
}

var value uint64

cpu_core_throttles := make(map[int]uint64)
packageThrottles := make(map[uint64]uint64)
packageCoreThrottles := make(map[uint64]map[uint64]uint64)

// cpu loop
for _, cpu := range cpus {
Expand Down Expand Up @@ -132,66 +131,69 @@ func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error {
ch <- prometheus.MustNewConstMetric(c.cpuFreqMax, prometheus.GaugeValue, float64(value)*1000.0, cpuNum)
}

if _, err := os.Stat(filepath.Join(cpu, "thermal_throttle")); os.IsNotExist(err) {
log.Debugf("CPU %v is missing thermal_throttle", cpu)
// See
// https://www.kernel.org/doc/Documentation/x86/topology.txt
// https://www.kernel.org/doc/Documentation/cputopology.txt
// https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-system-cpu
var err error
var physicalPackageID, coreID uint64

// topology/physical_package_id
if physicalPackageID, err = readUintFromFile(filepath.Join(cpu, "topology", "physical_package_id")); err != nil {
log.Debugf("CPU %v is missing physical_package_id", cpu)
continue
}
// topology/core_id
if coreID, err = readUintFromFile(filepath.Join(cpu, "topology", "core_id")); err != nil {
log.Debugf("CPU %v is missing core_id", cpu)
continue
}

if value, err := readUintFromFile(filepath.Join(cpu, "topology/core_id")); err != nil {
log.Debugf("CPU %v is misssing topology/core_id", cpu)
} else {
core_id := int(value)
if value, err = readUintFromFile(filepath.Join(cpu, "thermal_throttle", "core_throttle_count")); err != nil {
return err
// metric node_cpu_core_throttles_total
//
// We process this metric before the package throttles as there
// are cpu+kernel combinations that only present core throttles
// but no package throttles.
// Seen e.g. on an Intel Xeon E5472 system with RHEL 6.9 kernel.
if _, present := packageCoreThrottles[physicalPackageID]; !present {
packageCoreThrottles[physicalPackageID] = make(map[uint64]uint64)
}
if _, present := packageCoreThrottles[physicalPackageID][coreID]; !present {
// Read thermal_throttle/core_throttle_count only once
if coreThrottleCount, err := readUintFromFile(filepath.Join(cpu, "thermal_throttle", "core_throttle_count")); err == nil {
packageCoreThrottles[physicalPackageID][coreID] = coreThrottleCount
} else {
log.Debugf("CPU %v is missing core_throttle_count", cpu)
}
cpu_core_throttles[core_id] = value
}
}

// core throttles
for core_id, value := range cpu_core_throttles {
ch <- prometheus.MustNewConstMetric(c.cpuCoreThrottle, prometheus.CounterValue, float64(value), strconv.Itoa(core_id))
// metric node_cpu_package_throttles_total
if _, present := packageThrottles[physicalPackageID]; !present {
// Read thermal_throttle/package_throttle_count only once
if packageThrottleCount, err := readUintFromFile(filepath.Join(cpu, "thermal_throttle", "package_throttle_count")); err == nil {
packageThrottles[physicalPackageID] = packageThrottleCount
} else {
log.Debugf("CPU %v is missing package_throttle_count", cpu)
}
}
}

nodes, err := filepath.Glob(sysFilePath("bus/node/devices/node[0-9]*"))
if err != nil {
return err
for physicalPackageID, packageThrottleCount := range packageThrottles {
ch <- prometheus.MustNewConstMetric(c.cpuPackageThrottle,
prometheus.CounterValue,
float64(packageThrottleCount),
strconv.FormatUint(physicalPackageID, 10))
}

// package / NUMA node loop
for _, node := range nodes {
if _, err := os.Stat(filepath.Join(node, "cpulist")); os.IsNotExist(err) {
log.Debugf("NUMA node %v is missing cpulist", node)
continue
}
cpulist, err := ioutil.ReadFile(filepath.Join(node, "cpulist"))
if err != nil {
log.Debugf("could not read cpulist of NUMA node %v", node)
return err
}
// cpulist example of one package/node with HT: "0-11,24-35"
line := strings.Split(string(cpulist), "\n")[0]
if line == "" {
// Skip processor-less (memory-only) NUMA nodes.
// E.g. RAM expansion with Intel Optane Drive(s) using
// Intel Memory Drive Technology (IMDT).
log.Debugf("skipping processor-less (memory-only) NUMA node %v", node)
continue
}
firstCPU := strings.FieldsFunc(line, func(r rune) bool {
return r == '-' || r == ','
})[0]
if _, err := os.Stat(filepath.Join(node, "cpu"+firstCPU, "thermal_throttle", "package_throttle_count")); os.IsNotExist(err) {
log.Debugf("Node %v CPU %v is missing package_throttle", node, firstCPU)
continue
for physicalPackageID, core_map := range packageCoreThrottles {
for coreID, coreThrottleCount := range core_map {
ch <- prometheus.MustNewConstMetric(c.cpuCoreThrottle,
prometheus.CounterValue,
float64(coreThrottleCount),
strconv.FormatUint(physicalPackageID, 10),
strconv.FormatUint(coreID, 10))
}
if value, err = readUintFromFile(filepath.Join(node, "cpu"+firstCPU, "thermal_throttle", "package_throttle_count")); err != nil {
return err
}
nodeno := digitRegexp.FindAllString(node, 1)[0]
ch <- prometheus.MustNewConstMetric(c.cpuPackageThrottle, prometheus.CounterValue, float64(value), nodeno)
}

return nil
}

Expand Down
47 changes: 44 additions & 3 deletions collector/fixtures/e2e-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -180,22 +180,27 @@ node_buddyinfo_blocks{node="0",size="9",zone="Normal"} 0
node_context_switches_total 3.8014093e+07
# HELP node_cpu_core_throttles_total Number of times this cpu core has been throttled.
# TYPE node_cpu_core_throttles_total counter
node_cpu_core_throttles_total{core="0"} 5
node_cpu_core_throttles_total{core="1"} 0
node_cpu_core_throttles_total{core="0",package="0"} 5
node_cpu_core_throttles_total{core="0",package="1"} 0
node_cpu_core_throttles_total{core="1",package="0"} 0
node_cpu_core_throttles_total{core="1",package="1"} 9
# HELP node_cpu_frequency_hertz Current cpu thread frequency in hertz.
# TYPE node_cpu_frequency_hertz gauge
node_cpu_frequency_hertz{cpu="0"} 1.699981e+09
node_cpu_frequency_hertz{cpu="1"} 1.699981e+09
node_cpu_frequency_hertz{cpu="2"} 8e+06
node_cpu_frequency_hertz{cpu="3"} 8e+06
# HELP node_cpu_frequency_max_hertz Maximum cpu thread frequency in hertz.
# TYPE node_cpu_frequency_max_hertz gauge
node_cpu_frequency_max_hertz{cpu="0"} 3.7e+09
node_cpu_frequency_max_hertz{cpu="1"} 3.7e+09
node_cpu_frequency_max_hertz{cpu="2"} 4.2e+09
node_cpu_frequency_max_hertz{cpu="3"} 4.2e+09
# HELP node_cpu_frequency_min_hertz Minimum cpu thread frequency in hertz.
# TYPE node_cpu_frequency_min_hertz gauge
node_cpu_frequency_min_hertz{cpu="0"} 8e+08
node_cpu_frequency_min_hertz{cpu="1"} 8e+08
node_cpu_frequency_min_hertz{cpu="2"} 1e+06
node_cpu_frequency_min_hertz{cpu="3"} 1e+06
# HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode.
# TYPE node_cpu_guest_seconds_total counter
Expand All @@ -217,7 +222,8 @@ node_cpu_guest_seconds_total{cpu="7",mode="nice"} 0.08
node_cpu_guest_seconds_total{cpu="7",mode="user"} 0.09
# HELP node_cpu_package_throttles_total Number of times this cpu package has been throttled.
# TYPE node_cpu_package_throttles_total counter
node_cpu_package_throttles_total{node="0"} 30
node_cpu_package_throttles_total{package="0"} 30
node_cpu_package_throttles_total{package="1"} 6
# HELP node_cpu_seconds_total Seconds the cpus spent in each mode.
# TYPE node_cpu_seconds_total counter
node_cpu_seconds_total{cpu="0",mode="idle"} 10870.69
Expand Down Expand Up @@ -1219,142 +1225,177 @@ node_memory_Writeback_bytes 0
# TYPE node_memory_numa_Active gauge
node_memory_numa_Active{node="0"} 5.58733312e+09
node_memory_numa_Active{node="1"} 5.739003904e+09
node_memory_numa_Active{node="2"} 5.739003904e+09
# HELP node_memory_numa_Active_anon Memory information field Active_anon.
# TYPE node_memory_numa_Active_anon gauge
node_memory_numa_Active_anon{node="0"} 7.07915776e+08
node_memory_numa_Active_anon{node="1"} 6.04635136e+08
node_memory_numa_Active_anon{node="2"} 6.04635136e+08
# HELP node_memory_numa_Active_file Memory information field Active_file.
# TYPE node_memory_numa_Active_file gauge
node_memory_numa_Active_file{node="0"} 4.879417344e+09
node_memory_numa_Active_file{node="1"} 5.134368768e+09
node_memory_numa_Active_file{node="2"} 5.134368768e+09
# HELP node_memory_numa_AnonHugePages Memory information field AnonHugePages.
# TYPE node_memory_numa_AnonHugePages gauge
node_memory_numa_AnonHugePages{node="0"} 1.50994944e+08
node_memory_numa_AnonHugePages{node="1"} 9.2274688e+07
node_memory_numa_AnonHugePages{node="2"} 9.2274688e+07
# HELP node_memory_numa_AnonPages Memory information field AnonPages.
# TYPE node_memory_numa_AnonPages gauge
node_memory_numa_AnonPages{node="0"} 8.07112704e+08
node_memory_numa_AnonPages{node="1"} 6.88058368e+08
node_memory_numa_AnonPages{node="2"} 6.88058368e+08
# HELP node_memory_numa_Bounce Memory information field Bounce.
# TYPE node_memory_numa_Bounce gauge
node_memory_numa_Bounce{node="0"} 0
node_memory_numa_Bounce{node="1"} 0
node_memory_numa_Bounce{node="2"} 0
# HELP node_memory_numa_Dirty Memory information field Dirty.
# TYPE node_memory_numa_Dirty gauge
node_memory_numa_Dirty{node="0"} 20480
node_memory_numa_Dirty{node="1"} 122880
node_memory_numa_Dirty{node="2"} 122880
# HELP node_memory_numa_FilePages Memory information field FilePages.
# TYPE node_memory_numa_FilePages gauge
node_memory_numa_FilePages{node="0"} 7.1855017984e+10
node_memory_numa_FilePages{node="1"} 8.5585088512e+10
node_memory_numa_FilePages{node="2"} 8.5585088512e+10
# HELP node_memory_numa_HugePages_Free Memory information field HugePages_Free.
# TYPE node_memory_numa_HugePages_Free gauge
node_memory_numa_HugePages_Free{node="0"} 0
node_memory_numa_HugePages_Free{node="1"} 0
node_memory_numa_HugePages_Free{node="2"} 0
# HELP node_memory_numa_HugePages_Surp Memory information field HugePages_Surp.
# TYPE node_memory_numa_HugePages_Surp gauge
node_memory_numa_HugePages_Surp{node="0"} 0
node_memory_numa_HugePages_Surp{node="1"} 0
node_memory_numa_HugePages_Surp{node="2"} 0
# HELP node_memory_numa_HugePages_Total Memory information field HugePages_Total.
# TYPE node_memory_numa_HugePages_Total gauge
node_memory_numa_HugePages_Total{node="0"} 0
node_memory_numa_HugePages_Total{node="1"} 0
node_memory_numa_HugePages_Total{node="2"} 0
# HELP node_memory_numa_Inactive Memory information field Inactive.
# TYPE node_memory_numa_Inactive gauge
node_memory_numa_Inactive{node="0"} 6.0569788416e+10
node_memory_numa_Inactive{node="1"} 7.3165406208e+10
node_memory_numa_Inactive{node="2"} 7.3165406208e+10
# HELP node_memory_numa_Inactive_anon Memory information field Inactive_anon.
# TYPE node_memory_numa_Inactive_anon gauge
node_memory_numa_Inactive_anon{node="0"} 3.48626944e+08
node_memory_numa_Inactive_anon{node="1"} 2.91930112e+08
node_memory_numa_Inactive_anon{node="2"} 2.91930112e+08
# HELP node_memory_numa_Inactive_file Memory information field Inactive_file.
# TYPE node_memory_numa_Inactive_file gauge
node_memory_numa_Inactive_file{node="0"} 6.0221161472e+10
node_memory_numa_Inactive_file{node="1"} 7.2873476096e+10
node_memory_numa_Inactive_file{node="2"} 7.2873476096e+10
# HELP node_memory_numa_KernelStack Memory information field KernelStack.
# TYPE node_memory_numa_KernelStack gauge
node_memory_numa_KernelStack{node="0"} 3.4832384e+07
node_memory_numa_KernelStack{node="1"} 3.1850496e+07
node_memory_numa_KernelStack{node="2"} 3.1850496e+07
# HELP node_memory_numa_Mapped Memory information field Mapped.
# TYPE node_memory_numa_Mapped gauge
node_memory_numa_Mapped{node="0"} 9.1570176e+08
node_memory_numa_Mapped{node="1"} 8.84850688e+08
node_memory_numa_Mapped{node="2"} 8.84850688e+08
# HELP node_memory_numa_MemFree Memory information field MemFree.
# TYPE node_memory_numa_MemFree gauge
node_memory_numa_MemFree{node="0"} 5.4303100928e+10
node_memory_numa_MemFree{node="1"} 4.0586022912e+10
node_memory_numa_MemFree{node="2"} 4.0586022912e+10
# HELP node_memory_numa_MemTotal Memory information field MemTotal.
# TYPE node_memory_numa_MemTotal gauge
node_memory_numa_MemTotal{node="0"} 1.3740271616e+11
node_memory_numa_MemTotal{node="1"} 1.37438953472e+11
node_memory_numa_MemTotal{node="2"} 1.37438953472e+11
# HELP node_memory_numa_MemUsed Memory information field MemUsed.
# TYPE node_memory_numa_MemUsed gauge
node_memory_numa_MemUsed{node="0"} 8.3099615232e+10
node_memory_numa_MemUsed{node="1"} 9.685293056e+10
node_memory_numa_MemUsed{node="2"} 9.685293056e+10
# HELP node_memory_numa_Mlocked Memory information field Mlocked.
# TYPE node_memory_numa_Mlocked gauge
node_memory_numa_Mlocked{node="0"} 0
node_memory_numa_Mlocked{node="1"} 0
node_memory_numa_Mlocked{node="2"} 0
# HELP node_memory_numa_NFS_Unstable Memory information field NFS_Unstable.
# TYPE node_memory_numa_NFS_Unstable gauge
node_memory_numa_NFS_Unstable{node="0"} 0
node_memory_numa_NFS_Unstable{node="1"} 0
node_memory_numa_NFS_Unstable{node="2"} 0
# HELP node_memory_numa_PageTables Memory information field PageTables.
# TYPE node_memory_numa_PageTables gauge
node_memory_numa_PageTables{node="0"} 1.46743296e+08
node_memory_numa_PageTables{node="1"} 1.27254528e+08
node_memory_numa_PageTables{node="2"} 1.27254528e+08
# HELP node_memory_numa_SReclaimable Memory information field SReclaimable.
# TYPE node_memory_numa_SReclaimable gauge
node_memory_numa_SReclaimable{node="0"} 4.580478976e+09
node_memory_numa_SReclaimable{node="1"} 4.724822016e+09
node_memory_numa_SReclaimable{node="2"} 4.724822016e+09
# HELP node_memory_numa_SUnreclaim Memory information field SUnreclaim.
# TYPE node_memory_numa_SUnreclaim gauge
node_memory_numa_SUnreclaim{node="0"} 2.23352832e+09
node_memory_numa_SUnreclaim{node="1"} 2.464391168e+09
node_memory_numa_SUnreclaim{node="2"} 2.464391168e+09
# HELP node_memory_numa_Shmem Memory information field Shmem.
# TYPE node_memory_numa_Shmem gauge
node_memory_numa_Shmem{node="0"} 4.900864e+07
node_memory_numa_Shmem{node="1"} 8.968192e+07
node_memory_numa_Shmem{node="2"} 8.968192e+07
# HELP node_memory_numa_Slab Memory information field Slab.
# TYPE node_memory_numa_Slab gauge
node_memory_numa_Slab{node="0"} 6.814007296e+09
node_memory_numa_Slab{node="1"} 7.189213184e+09
node_memory_numa_Slab{node="2"} 7.189213184e+09
# HELP node_memory_numa_Unevictable Memory information field Unevictable.
# TYPE node_memory_numa_Unevictable gauge
node_memory_numa_Unevictable{node="0"} 0
node_memory_numa_Unevictable{node="1"} 0
node_memory_numa_Unevictable{node="2"} 0
# HELP node_memory_numa_Writeback Memory information field Writeback.
# TYPE node_memory_numa_Writeback gauge
node_memory_numa_Writeback{node="0"} 0
node_memory_numa_Writeback{node="1"} 0
node_memory_numa_Writeback{node="2"} 0
# HELP node_memory_numa_WritebackTmp Memory information field WritebackTmp.
# TYPE node_memory_numa_WritebackTmp gauge
node_memory_numa_WritebackTmp{node="0"} 0
node_memory_numa_WritebackTmp{node="1"} 0
node_memory_numa_WritebackTmp{node="2"} 0
# HELP node_memory_numa_interleave_hit_total Memory information field interleave_hit_total.
# TYPE node_memory_numa_interleave_hit_total counter
node_memory_numa_interleave_hit_total{node="0"} 57146
node_memory_numa_interleave_hit_total{node="1"} 57286
node_memory_numa_interleave_hit_total{node="2"} 7286
# HELP node_memory_numa_local_node_total Memory information field local_node_total.
# TYPE node_memory_numa_local_node_total counter
node_memory_numa_local_node_total{node="0"} 1.93454780853e+11
node_memory_numa_local_node_total{node="1"} 3.2671904655e+11
node_memory_numa_local_node_total{node="2"} 2.671904655e+10
# HELP node_memory_numa_numa_foreign_total Memory information field numa_foreign_total.
# TYPE node_memory_numa_numa_foreign_total counter
node_memory_numa_numa_foreign_total{node="0"} 5.98586233e+10
node_memory_numa_numa_foreign_total{node="1"} 1.2624528e+07
node_memory_numa_numa_foreign_total{node="2"} 2.624528e+06
# HELP node_memory_numa_numa_hit_total Memory information field numa_hit_total.
# TYPE node_memory_numa_numa_hit_total counter
node_memory_numa_numa_hit_total{node="0"} 1.93460335812e+11
node_memory_numa_numa_hit_total{node="1"} 3.26720946761e+11
node_memory_numa_numa_hit_total{node="2"} 2.6720946761e+10
# HELP node_memory_numa_numa_miss_total Memory information field numa_miss_total.
# TYPE node_memory_numa_numa_miss_total counter
node_memory_numa_numa_miss_total{node="0"} 1.2624528e+07
node_memory_numa_numa_miss_total{node="1"} 5.9858626709e+10
node_memory_numa_numa_miss_total{node="2"} 9.858626709e+09
# HELP node_memory_numa_other_node_total Memory information field other_node_total.
# TYPE node_memory_numa_other_node_total counter
node_memory_numa_other_node_total{node="0"} 1.8179487e+07
node_memory_numa_other_node_total{node="1"} 5.986052692e+10
node_memory_numa_other_node_total{node="2"} 9.86052692e+09
# HELP node_mountstats_nfs_age_seconds_total The age of the NFS mount in seconds.
# TYPE node_mountstats_nfs_age_seconds_total counter
node_mountstats_nfs_age_seconds_total{export="192.168.1.1:/srv/test"} 13968
Expand Down
Loading