From 3046f957d50abb6ff8a9e768c3d93a751e940985 Mon Sep 17 00:00:00 2001
From: Jack Zampolin <jack.zampolin@gmail.com>
Date: Tue, 17 Apr 2018 13:40:55 -0700
Subject: [PATCH] Add nvidia_smi input to monitor nvidia GPUs (#4026)

---
 plugins/inputs/all/all.go                    |   1 +
 plugins/inputs/nvidia_smi/README.md          |  47 ++++++
 plugins/inputs/nvidia_smi/nvidia_smi.go      | 149 +++++++++++++++++++
 plugins/inputs/nvidia_smi/nvidia_smi_test.go |  35 +++++
 4 files changed, 232 insertions(+)
 create mode 100644 plugins/inputs/nvidia_smi/README.md
 create mode 100644 plugins/inputs/nvidia_smi/nvidia_smi.go
 create mode 100644 plugins/inputs/nvidia_smi/nvidia_smi_test.go

diff --git a/plugins/inputs/all/all.go b/plugins/inputs/all/all.go
index e3264ef8..440d1e9a 100644
--- a/plugins/inputs/all/all.go
+++ b/plugins/inputs/all/all.go
@@ -65,6 +65,7 @@ import (
 	_ "github.com/influxdata/telegraf/plugins/inputs/nsq_consumer"
 	_ "github.com/influxdata/telegraf/plugins/inputs/nstat"
 	_ "github.com/influxdata/telegraf/plugins/inputs/ntpq"
+	_ "github.com/influxdata/telegraf/plugins/inputs/nvidia_smi"
 	_ "github.com/influxdata/telegraf/plugins/inputs/openldap"
 	_ "github.com/influxdata/telegraf/plugins/inputs/opensmtpd"
 	_ "github.com/influxdata/telegraf/plugins/inputs/passenger"
diff --git a/plugins/inputs/nvidia_smi/README.md b/plugins/inputs/nvidia_smi/README.md
new file mode 100644
index 00000000..84b8527f
--- /dev/null
+++ b/plugins/inputs/nvidia_smi/README.md
@@ -0,0 +1,47 @@
+# `nvidia-smi` Input Plugin
+
+This plugin queries the [`nvidia-smi`](https://developer.nvidia.com/nvidia-system-management-interface) binary to pull GPU statistics, including memory and GPU utilization, temperature, and fan speed.
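+
+Under the hood the plugin shells out to `nvidia-smi` with a fixed query list (see `pollSMI` in `nvidia_smi.go`); the call is roughly equivalent to running:
+
+```sh
+nvidia-smi --format=noheader,nounits,csv \
+  --query-gpu=fan.speed,memory.total,memory.used,memory.free,pstate,temperature.gpu,name,uuid,compute_mode,utilization.gpu,utilization.memory,index
+```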
+
+### Configuration
+
+```toml
+# Pulls statistics from nvidia GPUs attached to the host
+[[inputs.nvidia_smi]]
+## Optional: path to nvidia-smi binary, defaults to /usr/bin/nvidia-smi
+# bin_path = "/usr/bin/nvidia-smi"
+
+## Optional: timeout for GPU polling
+# timeout = "5s"
+```
+
+### Metrics
+- measurement: `nvidia_smi`
+  - tags
+    - `name` (type of GPU e.g. `GeForce GTX 1070 Ti`)
+    - `compute_mode` (The compute mode of the GPU e.g. `Default`)
+    - `index` (The index of the GPU as reported by `nvidia-smi` e.g. `1`)
+    - `pstate` (The performance state of the GPU e.g. `P0`)
+    - `uuid` (A unique identifier for the GPU e.g. `GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665`)
+  - fields
+    - `fan_speed` (integer, percentage)
+    - `memory_free` (integer, MiB)
+    - `memory_used` (integer, MiB)
+    - `memory_total` (integer, MiB)
+    - `temperature_gpu` (integer, degrees C)
+    - `utilization_gpu` (integer, percentage)
+    - `utilization_memory` (integer, percentage)
+
+### Sample Query
+
+The query below can be used to alert on the average temperature of your GPUs over the last minute:
+
+```
+SELECT mean("temperature_gpu") FROM "nvidia_smi" WHERE time > now() - 5m GROUP BY time(1m), "index", "name", "host"
+```
+
+### Example Output
+```
+nvidia_smi,compute_mode=Default,host=8218cf,index=0,name=GeForce\ GTX\ 1070,pstate=P2,uuid=GPU-823bc202-6279-6f2c-d729-868a30f14d96 fan_speed=100i,memory_free=7563i,memory_total=8112i,memory_used=549i,temperature_gpu=53i,utilization_gpu=100i,utilization_memory=90i 1523991122000000000
+nvidia_smi,compute_mode=Default,host=8218cf,index=1,name=GeForce\ GTX\ 1080,pstate=P2,uuid=GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665 fan_speed=100i,memory_free=7557i,memory_total=8114i,memory_used=557i,temperature_gpu=50i,utilization_gpu=100i,utilization_memory=85i 1523991122000000000
+nvidia_smi,compute_mode=Default,host=8218cf,index=2,name=GeForce\ GTX\ 1080,pstate=P2,uuid=GPU-d4cfc28d-0481-8d07-b81a-ddfc63d74adf fan_speed=100i,memory_free=7557i,memory_total=8114i,memory_used=557i,temperature_gpu=58i,utilization_gpu=100i,utilization_memory=86i 1523991122000000000
+```
diff --git a/plugins/inputs/nvidia_smi/nvidia_smi.go b/plugins/inputs/nvidia_smi/nvidia_smi.go
new file mode 100644
index 00000000..0cf9bd9e
--- /dev/null
+++ b/plugins/inputs/nvidia_smi/nvidia_smi.go
@@ -0,0 +1,149 @@
+package nvidia_smi
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"os/exec"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/influxdata/telegraf"
+	"github.com/influxdata/telegraf/internal"
+	"github.com/influxdata/telegraf/plugins/inputs"
+)
+
+var (
+	measurement = "nvidia_smi"
+	metrics     = "fan.speed,memory.total,memory.used,memory.free,pstate,temperature.gpu,name,uuid,compute_mode,utilization.gpu,utilization.memory,index"
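+	// metricNames maps each queried metric, in the same order as the
+	// query string above, to its output name and whether it becomes a
+	// tag or a field on the emitted measurement.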
+	metricNames = [][]string{
+		{"fan_speed", "field"},
+		{"memory_total", "field"},
+		{"memory_used", "field"},
+		{"memory_free", "field"},
+		{"pstate", "tag"},
+		{"temperature_gpu", "field"},
+		{"name", "tag"},
+		{"uuid", "tag"},
+		{"compute_mode", "tag"},
+		{"utilization_gpu", "field"},
+		{"utilization_memory", "field"},
+		{"index", "tag"},
+	}
+)
+
+// NvidiaSMI holds the methods for this plugin
+type NvidiaSMI struct {
+	BinPath string
+	Timeout time.Duration
+
+	metrics string
+}
+
+// Description returns the description of the NvidiaSMI plugin
+func (smi *NvidiaSMI) Description() string {
+	return "Pulls statistics from nvidia GPUs attached to the host"
+}
+
+// SampleConfig returns the sample configuration for the NvidiaSMI plugin
+func (smi *NvidiaSMI) SampleConfig() string {
+	return `
+## Optional: path to nvidia-smi binary, defaults to /usr/bin/nvidia-smi
+# bin_path = "/usr/bin/nvidia-smi"
+
+## Optional: timeout for GPU polling
+# timeout = "5s"
+`
+}
+
+// Gather implements the telegraf interface
+func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
+
+	if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
+		return fmt.Errorf("nvidia-smi binary not at path %s, cannot gather GPU data", smi.BinPath)
+	}
+
+	data, err := smi.pollSMI()
+	if err != nil {
+		return err
+	}
+
+	err = gatherNvidiaSMI(data, acc)
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func init() {
+	inputs.Add("nvidia_smi", func() telegraf.Input {
+		return &NvidiaSMI{
+			BinPath: "/usr/bin/nvidia-smi",
+			Timeout: 5 * time.Second,
+			metrics: metrics,
+		}
+	})
+}
+
+func (smi *NvidiaSMI) pollSMI() (string, error) {
+	// Construct and execute metrics query
+	opts := []string{"--format=noheader,nounits,csv", fmt.Sprintf("--query-gpu=%s", smi.metrics)}
+	ret, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, opts...), smi.Timeout)
+	if err != nil {
+		return "", err
+	}
+	return string(ret), nil
+}
+
+func gatherNvidiaSMI(ret string, acc telegraf.Accumulator) error {
+	// First split the lines up and handle each one
+	scanner := bufio.NewScanner(strings.NewReader(ret))
+	for scanner.Scan() {
+		tags, fields, err := parseLine(scanner.Text())
+		if err != nil {
+			return err
+		}
+		acc.AddFields(measurement, fields, tags)
+	}
+
+	if err := scanner.Err(); err != nil {
+		return fmt.Errorf("error scanning text %s", ret)
+	}
+
+	return nil
+}
+
+func parseLine(line string) (map[string]string, map[string]interface{}, error) {
+	tags := make(map[string]string)
+	fields := make(map[string]interface{})
+
+	// Next split up the comma delimited metrics
+	met := strings.Split(line, ",")
+
+	// Make sure there are as many metrics in the line as there were queried.
+	if len(met) == len(metricNames) {
+		for i, m := range metricNames {
+
+			// First handle the tags
+			if m[1] == "tag" {
+				tags[m[0]] = strings.TrimSpace(met[i])
+				continue
+			}
+
+			// Then parse the integers out of the fields
+			out, err := strconv.ParseInt(strings.TrimSpace(met[i]), 10, 64)
+			if err != nil {
+				return tags, fields, err
+			}
+			fields[m[0]] = out
+		}
+
+		// Return the tags and fields
+		return tags, fields, nil
+	}
+
+	// Otherwise the line had an unexpected number of values (e.g. it was empty)
+	return tags, fields, fmt.Errorf("different number of metrics returned (%d) than expected (%d)", len(met), len(metricNames))
+}
diff --git a/plugins/inputs/nvidia_smi/nvidia_smi_test.go b/plugins/inputs/nvidia_smi/nvidia_smi_test.go
new file mode 100644
index 00000000..62ddee3b
--- /dev/null
+++ b/plugins/inputs/nvidia_smi/nvidia_smi_test.go
@@ -0,0 +1,35 @@
+package nvidia_smi
+
+import (
+	"testing"
+)
+
+func TestParseLineStandard(t *testing.T) {
+	line := "85, 8114, 553, 7561, P2, 61, GeForce GTX 1070 Ti, GPU-d1911b8a-f5c8-5e66-057c-486561269de8, Default, 100, 93, 1\n"
+	tags, fields, err := parseLine(line)
+	if err != nil {
+		t.Fail()
+	}
+	if tags["name"] != "GeForce GTX 1070 Ti" {
+		t.Fail()
+	}
+	if temp, ok := fields["temperature_gpu"].(int64); !ok || temp != 61 {
+		t.Fail()
+	}
+}
+
+func TestParseLineEmptyLine(t *testing.T) {
+	line := "\n"
+	_, _, err := parseLine(line)
+	if err == nil {
+		t.Fail()
+	}
+}
+
+func TestParseLineBad(t *testing.T) {
+	line := "the quick brown fox jumped over the lazy dog"
+	_, _, err := parseLine(line)
+	if err == nil {
+		t.Fail()
+	}
+}
-- 
GitLab