From e725e4e1d756aca4289d61c3691246c14ccba97e Mon Sep 17 00:00:00 2001 From: Enes Cakir Date: Thu, 9 May 2024 10:08:56 +0300 Subject: [PATCH] Add GPU runner image template Add the ubuntu-22.04.gpu image type and a Packer template for it (#2). nvidia-driver.sh installs the cuda-12-4 and cudnn9-cuda-12 packages from the NVIDIA CUDA apt repository and registers /usr/local/cuda/bin through prepend_etc_environment_path. nvidia-container-toolkit.sh installs nvidia-container-toolkit and configures the nvidia runtime as the Docker default; its repository list is fetched with curl -f so an HTTP error aborts the build instead of being written to the apt sources. The dockerhub_password variable is marked sensitive like the other credentials. --- .github/workflows/ubicloud-image.yml | 3 + .../ubicloud/nvidia-container-toolkit.sh | 24 + .../ubuntu/scripts/ubicloud/nvidia-driver.sh | 21 + .../ubuntu/templates/ubuntu-22.04.gpu.pkr.hcl | 498 ++++++++++++++++++ 4 files changed, 546 insertions(+) create mode 100755 images/ubuntu/scripts/ubicloud/nvidia-container-toolkit.sh create mode 100755 images/ubuntu/scripts/ubicloud/nvidia-driver.sh create mode 100644 images/ubuntu/templates/ubuntu-22.04.gpu.pkr.hcl diff --git a/.github/workflows/ubicloud-image.yml b/.github/workflows/ubicloud-image.yml index c9090c4c3a0c..b7bbae0075d3 100644 --- a/.github/workflows/ubicloud-image.yml +++ b/.github/workflows/ubicloud-image.yml @@ -15,6 +15,7 @@ on: - ubuntu-20.04.arm64 - ubuntu-22.04 - ubuntu-22.04.arm64 + - ubuntu-22.04.gpu - ubuntu-minimal image_suffix: description: "Suffix for image name, if not provided the image version" @@ -64,6 +65,7 @@ jobs: "ubuntu-20.04.arm64" { 'Ubuntu2004-Readme.md' } "ubuntu-22.04" { 'Ubuntu2204-Readme.md' } "ubuntu-22.04.arm64" { 'Ubuntu2204-Readme.md' } + "ubuntu-22.04.gpu" { 'Ubuntu2204-Readme.md' } "ubuntu-minimal" { 'Ubuntu2204-Readme.md' } } "Readme_Name=$ReadmeName" >> $env:GITHUB_ENV @@ -250,6 +252,7 @@ jobs: "ubuntu-20.04.arm64") image_name="github-ubuntu-2004-arm64" ;; "ubuntu-22.04") image_name="github-ubuntu-2204-x64" ;; "ubuntu-22.04.arm64") image_name="github-ubuntu-2204-arm64" ;; + "ubuntu-22.04.gpu") image_name="github-gpu-ubuntu-2204-x64" ;; "ubuntu-minimal") image_name="github-ubuntu-minimal-x64" ;; "ubuntu-postgres") image_name="postgres-ubuntu-2204-x64" ;; esac diff --git a/images/ubuntu/scripts/ubicloud/nvidia-container-toolkit.sh 
b/images/ubuntu/scripts/ubicloud/nvidia-container-toolkit.sh new file mode 100755 index 000000000000..6120d7cde2ff --- /dev/null +++ b/images/ubuntu/scripts/ubicloud/nvidia-container-toolkit.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -euo pipefail + +# Inspired by: https://github.com/nv-gha-runners/vm-images/blob/main/linux/installers/nvidia-container-toolkit.sh + +KEYRING="/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg" +APT="/etc/apt/sources.list.d/nvidia-container-toolkit.list" + +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o "${KEYRING}" +curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed "s#deb https://#deb [signed-by=${KEYRING}] https://#g" | \ + sudo tee "${APT}" + +sudo apt-get update + +sudo apt-get install -y --no-install-recommends nvidia-container-toolkit + +sudo rm -rf "${APT}" "${KEYRING}" + +# Add nvidia runtime to docker and set as default +sudo nvidia-ctk runtime configure --runtime docker --set-as-default + +sudo systemctl restart docker +docker info diff --git a/images/ubuntu/scripts/ubicloud/nvidia-driver.sh b/images/ubuntu/scripts/ubicloud/nvidia-driver.sh new file mode 100755 index 000000000000..0ae5b08394cf --- /dev/null +++ b/images/ubuntu/scripts/ubicloud/nvidia-driver.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -euo pipefail + +# Inspired by: https://github.com/nv-gha-runners/vm-images/blob/main/linux/installers/nvidia-driver.sh + +# Source the helpers for use with the script +source "${HELPER_SCRIPTS}/etc-environment.sh" + +KEYRING=cuda-keyring_1.1-1_all.deb +ARCH=x86_64 +#NV_DRIVER_VERSION=550 +CUDA_TOOLKIT_VERSION=12-4 + +wget -q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/${KEYRING}" +sudo dpkg --install "${KEYRING}" +sudo apt-get update + +sudo apt-get -y install "cuda-${CUDA_TOOLKIT_VERSION}" "cudnn9-cuda-12" +prepend_etc_environment_path "/usr/local/cuda/bin" + +sudo dpkg --purge "$(dpkg -f "${KEYRING}" 
Package)" diff --git a/images/ubuntu/templates/ubuntu-22.04.gpu.pkr.hcl b/images/ubuntu/templates/ubuntu-22.04.gpu.pkr.hcl new file mode 100644 index 000000000000..13c17ba91234 --- /dev/null +++ b/images/ubuntu/templates/ubuntu-22.04.gpu.pkr.hcl @@ -0,0 +1,498 @@ +packer { + required_plugins { + azure = { + source = "github.com/hashicorp/azure" + version = "1.4.5" + } + } +} + +locals { + managed_image_name = var.managed_image_name != "" ? var.managed_image_name : "packer-${var.image_os}-${var.image_version}" +} + +variable "allowed_inbound_ip_addresses" { + type = list(string) + default = [] +} + +variable "azure_tags" { + type = map(string) + default = {} +} + +variable "build_resource_group_name" { + type = string + default = "${env("BUILD_RESOURCE_GROUP_NAME")}" +} + +variable "client_cert_path" { + type = string + default = "${env("ARM_CLIENT_CERT_PATH")}" +} + +variable "client_id" { + type = string + default = "${env("ARM_CLIENT_ID")}" +} + +variable "client_secret" { + type = string + default = "${env("ARM_CLIENT_SECRET")}" + sensitive = true +} + +variable "dockerhub_login" { + type = string + default = "${env("DOCKERHUB_LOGIN")}" +} + +variable "dockerhub_password" { + type = string + default = "${env("DOCKERHUB_PASSWORD")}" + sensitive = true +} + +variable "helper_script_folder" { + type = string + default = "/imagegeneration/helpers" +} + +variable "image_folder" { + type = string + default = "/imagegeneration" +} + +variable "image_os" { + type = string + default = "ubuntu22" +} + +variable "image_version" { + type = string + default = "dev" +} + +variable "imagedata_file" { + type = string + default = "/imagegeneration/imagedata.json" +} + +variable "installer_script_folder" { + type = string + default = "/imagegeneration/installers" +} + +variable "install_password" { + type = string + default = "" + sensitive = true +} + +variable "location" { + type = string + default = "${env("ARM_RESOURCE_LOCATION")}" +} + +variable "managed_image_name" { + type = string + default 
= "" +} + +variable "managed_image_resource_group_name" { + type = string + default = "${env("ARM_RESOURCE_GROUP")}" +} + +variable "private_virtual_network_with_public_ip" { + type = bool + default = false +} + +variable "subscription_id" { + type = string + default = "${env("ARM_SUBSCRIPTION_ID")}" +} + +variable "temp_resource_group_name" { + type = string + default = "${env("TEMP_RESOURCE_GROUP_NAME")}" +} + +variable "tenant_id" { + type = string + default = "${env("ARM_TENANT_ID")}" +} + +variable "virtual_network_name" { + type = string + default = "${env("VNET_NAME")}" +} + +variable "virtual_network_resource_group_name" { + type = string + default = "${env("VNET_RESOURCE_GROUP")}" +} + +variable "virtual_network_subnet_name" { + type = string + default = "${env("VNET_SUBNET")}" +} + +variable "vm_size" { + type = string + default = "Standard_D4s_v4" +} + +source "azure-arm" "build_image" { + allowed_inbound_ip_addresses = "${var.allowed_inbound_ip_addresses}" + build_resource_group_name = "${var.build_resource_group_name}" + client_cert_path = "${var.client_cert_path}" + client_id = "${var.client_id}" + client_secret = "${var.client_secret}" + image_offer = "0001-com-ubuntu-server-jammy" + image_publisher = "canonical" + image_sku = "22_04-lts" + location = "${var.location}" + managed_image_name = "${local.managed_image_name}" + managed_image_resource_group_name = "${var.managed_image_resource_group_name}" + os_disk_size_gb = "50" + os_type = "Linux" + private_virtual_network_with_public_ip = "${var.private_virtual_network_with_public_ip}" + subscription_id = "${var.subscription_id}" + temp_resource_group_name = "${var.temp_resource_group_name}" + tenant_id = "${var.tenant_id}" + virtual_network_name = "${var.virtual_network_name}" + virtual_network_resource_group_name = "${var.virtual_network_resource_group_name}" + virtual_network_subnet_name = "${var.virtual_network_subnet_name}" + vm_size = "${var.vm_size}" + + dynamic "azure_tag" { + for_each = 
var.azure_tags + content { + name = azure_tag.key + value = azure_tag.value + } + } +} + +build { + sources = ["source.azure-arm.build_image"] + + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = ["mkdir ${var.image_folder}", "chmod 777 ${var.image_folder}"] + } + + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + script = "${path.root}/../scripts/build/configure-apt-mock.sh" + } + + provisioner "shell" { + environment_vars = ["DEBIAN_FRONTEND=noninteractive"] + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + scripts = [ + "${path.root}/../scripts/build/install-ms-repos.sh", + "${path.root}/../scripts/build/configure-apt-sources.sh", + "${path.root}/../scripts/build/configure-apt.sh" + ] + } + + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + script = "${path.root}/../scripts/build/configure-limits.sh" + } + + provisioner "file" { + destination = "${var.helper_script_folder}" + source = "${path.root}/../scripts/helpers" + } + + provisioner "file" { + destination = "${var.installer_script_folder}" + source = "${path.root}/../scripts/build" + } + + provisioner "file" { + destination = "${var.image_folder}" + sources = [ + "${path.root}/../assets/post-gen", + "${path.root}/../scripts/tests", + "${path.root}/../scripts/docs-gen" + ] + } + + provisioner "file" { + destination = "${var.image_folder}/docs-gen/" + source = "${path.root}/../../../helpers/software-report-base" + } + + provisioner "file" { + destination = "${var.installer_script_folder}/toolset.json" + source = "${path.root}/../toolsets/toolset-2204.json" + } + + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = [ + "mv ${var.image_folder}/docs-gen ${var.image_folder}/SoftwareReport", + "mv ${var.image_folder}/post-gen ${var.image_folder}/post-generation" + ] + } + + provisioner "shell" { + environment_vars = ["IMAGE_VERSION=${var.image_version}", 
"IMAGEDATA_FILE=${var.imagedata_file}"] + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + scripts = ["${path.root}/../scripts/build/configure-image-data.sh"] + } + + provisioner "shell" { + environment_vars = ["IMAGE_VERSION=${var.image_version}", "IMAGE_OS=${var.image_os}", "HELPER_SCRIPTS=${var.helper_script_folder}"] + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + scripts = ["${path.root}/../scripts/build/configure-environment.sh"] + } + + provisioner "shell" { + environment_vars = ["DEBIAN_FRONTEND=noninteractive", "HELPER_SCRIPTS=${var.helper_script_folder}", "INSTALLER_SCRIPT_FOLDER=${var.installer_script_folder}"] + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + scripts = ["${path.root}/../scripts/build/install-apt-vital.sh"] + } + + provisioner "shell" { + environment_vars = ["HELPER_SCRIPTS=${var.helper_script_folder}", "INSTALLER_SCRIPT_FOLDER=${var.installer_script_folder}"] + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + scripts = ["${path.root}/../scripts/build/install-powershell.sh"] + } + + provisioner "shell" { + environment_vars = ["HELPER_SCRIPTS=${var.helper_script_folder}", "INSTALLER_SCRIPT_FOLDER=${var.installer_script_folder}"] + execute_command = "sudo sh -c '{{ .Vars }} pwsh -f {{ .Path }}'" + scripts = ["${path.root}/../scripts/build/Install-PowerShellModules.ps1", "${path.root}/../scripts/build/Install-PowerShellAzModules.ps1"] + } + + provisioner "shell" { + environment_vars = ["HELPER_SCRIPTS=${var.helper_script_folder}", "INSTALLER_SCRIPT_FOLDER=${var.installer_script_folder}", "DEBIAN_FRONTEND=noninteractive"] + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + scripts = [ + "${path.root}/../scripts/build/install-actions-cache.sh", + "${path.root}/../scripts/build/install-runner-package.sh", + "${path.root}/../scripts/build/install-apt-common.sh", + "${path.root}/../scripts/build/install-azure-cli.sh", + "${path.root}/../scripts/build/install-aws-tools.sh", + 
"${path.root}/../scripts/build/install-codeql-bundle.sh", + "${path.root}/../scripts/build/install-container-tools.sh", + "${path.root}/../scripts/build/install-git.sh", + "${path.root}/../scripts/build/install-git-lfs.sh", + "${path.root}/../scripts/build/install-github-cli.sh", + "${path.root}/../scripts/build/install-miniconda.sh", + "${path.root}/../scripts/build/install-ruby.sh", + "${path.root}/../scripts/build/configure-dpkg.sh", + "${path.root}/../scripts/build/install-yq.sh", + "${path.root}/../scripts/build/install-pypy.sh", + "${path.root}/../scripts/build/install-python.sh", + "${path.root}/../scripts/build/install-zstd.sh" + ] + } + + provisioner "shell" { + environment_vars = ["HELPER_SCRIPTS=${var.helper_script_folder}", "INSTALLER_SCRIPT_FOLDER=${var.installer_script_folder}", "DOCKERHUB_LOGIN=${var.dockerhub_login}", "DOCKERHUB_PASSWORD=${var.dockerhub_password}"] + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + scripts = ["${path.root}/../scripts/build/install-docker-compose.sh", "${path.root}/../scripts/build/install-docker.sh"] + } + + provisioner "shell" { + environment_vars = ["HELPER_SCRIPTS=${var.helper_script_folder}", "INSTALLER_SCRIPT_FOLDER=${var.installer_script_folder}"] + execute_command = "sudo sh -c '{{ .Vars }} pwsh -f {{ .Path }}'" + scripts = ["${path.root}/../scripts/build/Install-Toolset.ps1", "${path.root}/../scripts/build/Configure-Toolset.ps1"] + } + + provisioner "shell" { + environment_vars = ["HELPER_SCRIPTS=${var.helper_script_folder}", "INSTALLER_SCRIPT_FOLDER=${var.installer_script_folder}"] + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + scripts = ["${path.root}/../scripts/build/install-pipx-packages.sh"] + } + + provisioner "shell" { + environment_vars = ["HELPER_SCRIPTS=${var.helper_script_folder}", "DEBIAN_FRONTEND=noninteractive", "INSTALLER_SCRIPT_FOLDER=${var.installer_script_folder}"] + execute_command = "/bin/sh -c '{{ .Vars }} {{ .Path }}'" + scripts = 
["${path.root}/../scripts/build/install-homebrew.sh"] + } + + provisioner "shell" { + environment_vars = ["HELPER_SCRIPTS=${var.helper_script_folder}"] + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + scripts = ["${path.root}/../scripts/build/configure-snap.sh"] + } + + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + expect_disconnect = true + inline = ["echo 'Reboot VM'", "sudo reboot"] + } + + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + pause_before = "1m0s" + scripts = ["${path.root}/../scripts/build/cleanup.sh"] + start_retry_timeout = "10m" + } + + provisioner "shell" { + environment_vars = ["HELPER_SCRIPT_FOLDER=${var.helper_script_folder}", "INSTALLER_SCRIPT_FOLDER=${var.installer_script_folder}", "IMAGE_FOLDER=${var.image_folder}"] + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + scripts = ["${path.root}/../scripts/build/configure-system.sh"] + } + + provisioner "file" { + destination = "/tmp/" + source = "${path.root}/../assets/ubuntu2204.conf" + } + + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = ["mkdir -p /etc/vsts", "cp /tmp/ubuntu2204.conf /etc/vsts/machine_instance.conf"] + } + + // provisioner "shell" { + // execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + // inline = ["sleep 30", "/usr/sbin/waagent -force -deprovision+user && export HISTSIZE=0 && sync"] + // } + + // ===================================== + // ========== UBICLOUD EXTRAS ========== + // ===================================== + // To be able to run this image in Ubicloud, we need to remove some Azure-specific configuration + + provisioner "shell" { + environment_vars = ["HELPER_SCRIPTS=${var.helper_script_folder}"] + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + scripts = [ + "${path.root}/../scripts/ubicloud/nvidia-driver.sh", + "${path.root}/../scripts/ubicloud/nvidia-container-toolkit.sh", + 
"${path.root}/../scripts/ubicloud/setup-runner-user.sh" + ] + } + + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = ["sleep 30"] + } + + // This is the Hyper-V Key-Value Pair daemon, which is not needed in Ubicloud + // It blocks booting the VM if it's not disabled + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = ["systemctl disable hv-kvp-daemon.service"] + } + + // Delete the Azure Linux Agent + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = ["apt -y purge walinuxagent"] + } + + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = ["rm -rf /var/lib/waagent", "rm -f /var/log/waagent.log"] + } + + // Clean up cloud-init logs and cache to run it again on first boot + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = ["cloud-init clean --logs --seed"] + } + + // Delete Azure specific cloud-init config files + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = ["rm -rf /etc/cloud/cloud.cfg.d/90-azure.cfg", "rm -rf /etc/cloud/cloud.cfg.d/10-azure-kvp.cfg"] + } + + // Replace cloud-init datasource_list with default list + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = [ < /etc/cloud/cloud.cfg.d/90_dpkg.cfg +EOF + ] + } + + // Delete Azure specific grub config files + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = ["rm -rf /etc/default/grub.d/40-force-partuuid.cfg", "rm -rf /etc/default/grub.d/50-cloudimg-settings.cfg"] + } + + // Replace 50-cloudimg-settings with default grub settings + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = [ <> /etc/default/grub.d/50-cloudimg-settings.cfg +EOF + ] + } + + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = 
["update-grub"] + } + + # Docker containers can't resolve DNS addresses by default on our networking + # setup. We will investigate it in depth, and try to find a more generic solution. + # Related issue: https://github.com/ubicloud/ubicloud/issues/507 + # Until a proper fix is available, we add a custom systemd-resolved configuration. + # Docker gets its resolv.conf content from the systemd-resolved service. + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = [ + "mkdir -p /etc/systemd/resolved.conf.d", + "echo \"[Resolve]\nDNS=9.9.9.9 149.112.112.112 2620:fe::fe 2620:fe::9\" > /etc/systemd/resolved.conf.d/Ubicloud.conf", + "systemctl restart systemd-resolved.service" + ] + } + + // sysstat is already installed, but it's not enabled by default + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = ["apt-get -y install sysstat", "systemctl enable sysstat", "systemctl start sysstat"] + } + + // Remove all existing ssh host keys + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = ["rm /etc/ssh/ssh_host_*key*"] + } + + // Delete the root password + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = ["passwd -d root"] + } + + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = ["sync"] + } + + // Delete the packer account + provisioner "shell" { + execute_command = "sudo sh -c '{{ .Vars }} {{ .Path }}'" + inline = ["touch /var/run/utmp", "userdel -f -r packer"] + } + +}