Quellcode durchsuchen

Merge pull request #26 from bobtiji/main

Adding DCGM and Nvidia-SMI to list of exporters
Christian vor 4 Jahren
Ursprung
Commit
d6ce5d372b

+ 35 - 0
docker-compose/prometheus/exporters/Nvidia DGCM exporter/README.md

@@ -0,0 +1,35 @@
+# Prerequisite
+
+    NVIDIA container toolkit
+        sudo apt -y install build-essential nvidia-cuda-toolkit nvidia-headless-495 nvidia-utils-495 libnvidia-encode-495 \
+            && distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
+            && curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - \
+            && curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list \
+            && sudo apt update \
+            && sudo apt -y install nvidia-container-toolkit nvidia-container-runtime nvidia-docker2 
+
+
+    DCGM on host machine running Nvidia GPU 
+        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin \
+            && sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 \
+            && sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub \
+            && sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" \
+            && sudo apt update \
+            && sudo apt install -y datacenter-gpu-manager \
+            && sudo systemctl --now enable nvidia-dcgm
+
+## Deployment
+
+1. Edit the Prometheus configuration file (`/etc/prometheus/prometheus.yml`, or wherever yours is located) and add the following job.
+# Job for Nvidia DCGM exporter in prometheus config file
+        - job_name: 'nvidia_exporter'
+          static_configs:
+            - targets: ['nvidia_exporter:9400'] # if the nvidia_exporter container is not on the same Docker network as Prometheus, change this line to "- targets: ['<your host IP>:9400']"
+
+# Additional References
+- [Official DCGM Documentation](https://github.com/NVIDIA/DCGM)
+- [Nvidia container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#install-guide)
+- [Nvidia DCGM exporter Documentation](https://github.com/NVIDIA/dcgm-exporter)
+- [Nvidia DCGM exporter Documentation-2](https://docs.nvidia.com/datacenter/cloud-native/gpu-telemetry/dcgm-exporter.html)
+- [Official Prometheus Documentation](https://prometheus.io/docs/introduction/overview/)
+- [A Grafana dashboard: not perfect and somewhat old, but configurable](https://grafana.com/grafana/dashboards/11578)

+ 21 - 0
docker-compose/prometheus/exporters/Nvidia DGCM exporter/docker-compose.yml

@@ -0,0 +1,21 @@
+---
+version: '3'
+services:
+  
+  nvidia_exporter: # Exports GPU metrics from DCGM on the host; the host needs DCGM installed at a version equal to or newer than the one in this container (https://github.com/NVIDIA/DCGM)
+    image: nvcr.io/nvidia/k8s/dcgm-exporter:2.3.2-2.6.2-ubuntu20.04
+    container_name: nvidia_exporter
+    # Runs the container with the "nvidia" runtime, so the NVIDIA container toolkit must be installed on the host.
+    runtime: nvidia
+    # NOTE(review): SYS_ADMIN is presumably needed by DCGM for some metrics - confirm it is still required.
+    cap_add:
+      - SYS_ADMIN
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=all
+    # Publishes container port 9400 on the host; Prometheus scrapes this port.
+    ports:
+      - "9400:9400"
+    restart: unless-stopped
+
+    # NVIDIA Data Center GPU Manager (DCGM): to export data from the DCGM host to Prometheus,
+    # DCGM must be installed on the host, together with the NVIDIA container toolkit.
+    # DCGM: https://github.com/NVIDIA/DCGM
+    # NVIDIA container toolkit: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#install-guide
+    # dcgm-exporter: https://github.com/NVIDIA/dcgm-exporter and https://docs.nvidia.com/datacenter/cloud-native/gpu-telemetry/dcgm-exporter.html

+ 23 - 0
docker-compose/prometheus/exporters/Nvidia_smi_exporter/README.md

@@ -0,0 +1,23 @@
+# Prerequisite
+
+    NVIDIA container toolkit
+        sudo apt -y install build-essential nvidia-cuda-toolkit nvidia-headless-495 nvidia-utils-495 libnvidia-encode-495 \
+            && distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
+            && curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - \
+            && curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list \
+            && sudo apt update \
+            && sudo apt -y install nvidia-container-toolkit nvidia-container-runtime nvidia-docker2 
+
+## Deployment
+
+1. Edit the Prometheus configuration file (`/etc/prometheus/prometheus.yml`, or wherever yours is located) and add the following job.
+# Job for Nvidia SMI exporter in prometheus config file
+        - job_name: 'nvidia_smi_exporter'
+          static_configs:
+            - targets: ['nvidia_smi_exporter:9835'] # if the nvidia_smi_exporter container is not on the same Docker network as Prometheus, change this line to "- targets: ['<your host IP>:9835']"
+
+# Additional References
+- [Nvidia container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#install-guide)
+- [Nvidia GPU exporter Documentation](https://github.com/utkuozdemir/nvidia_gpu_exporter)
+- [Official Prometheus Documentation](https://prometheus.io/docs/introduction/overview/)
+- [A Grafana dashboard: not perfect and somewhat old, but configurable](https://grafana.com/grafana/dashboards/14574)

+ 19 - 0
docker-compose/prometheus/exporters/Nvidia_smi_exporter/docker-compose.yml

@@ -0,0 +1,19 @@
+---
+version: '3'
+services:
+  nvidia_smi_exporter: # Exports data from nvidia-smi; requires nvidia-smi and nvidia-container-toolkit to be installed on the host.
+    image: utkuozdemir/nvidia_gpu_exporter:0.3.0
+    container_name: nvidia_smi_exporter
+    # Runs the container with the "nvidia" runtime provided by the host's NVIDIA container tooling.
+    runtime: nvidia
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=all
+    # Publishes container port 9835 on the host; Prometheus scrapes this port.
+    ports:
+      - "9835:9835"
+    # Bind-mounts the host's nvidia-smi binary and libnvidia-ml libraries into the container at the same paths.
+    volumes:
+      - /usr/bin/nvidia-smi:/usr/bin/nvidia-smi
+      - /usr/lib/x86_64-linux-gnu/libnvidia-ml.so:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so
+      - /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1
+    restart: unless-stopped
+
+# The host paths in the volume mounts above work on Ubuntu 20.04.
+# NOTE(review): other distributions may install these files elsewhere - verify the paths before deploying.