ROCm HIP¶

Introduction¶

ROCm HIP allows developers to convert CUDA code to portable C++. The same source code can be compiled to run on NVIDIA or AMD GPUs.

This page documents the use of pre-built Apptainer (previously Singularity) image on Karolina Accelerated nodes (acn).

Get Into GPU Node¶

$ salloc -p qgpu -A PROJECT_ID -t 01:00:00
salloc: Granted job allocation 1543777
salloc: Waiting for resource configuration
salloc: Nodes acn41 are ready for job

Installed Versions of Apptainer¶

For the current list of installed versions, use:

module avail apptainer
# ----------------- /apps/modules/tools ------------------
#   apptainer-wrappers/1.0 (A)    apptainer/1.1.5

Load the required module:

module load apptainer/1.1.5

Launch Apptainer¶

Run the container:

singularity  shell  /home/username/rocm/centos7-nvidia-rocm.sif

The above gives you Apptainer shell prompt:

Singularity>

Inside Container¶

Verify that you have GPUs active and accessible on the given node:

nvidia-smi

You should get output similar to:

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.07    Driver Version: 515.65.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA A100-SXM...  Off  | 00000000:07:00.0 Off |                    0 |
| N/A   26C    P0    50W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM...  Off  | 00000000:0B:00.0 Off |                    0 |
| N/A   26C    P0    51W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   2  NVIDIA A100-SXM...  Off  | 00000000:48:00.0 Off |                    0 |
| N/A   22C    P0    51W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   3  NVIDIA A100-SXM...  Off  | 00000000:4C:00.0 Off |                    0 |
| N/A   25C    P0    52W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   4  NVIDIA A100-SXM...  Off  | 00000000:88:00.0 Off |                    0 |
| N/A   22C    P0    51W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   5  NVIDIA A100-SXM...  Off  | 00000000:8B:00.0 Off |                    0 |
| N/A   26C    P0    54W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   6  NVIDIA A100-SXM...  Off  | 00000000:C8:00.0 Off |                    0 |
| N/A   25C    P0    52W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   7  NVIDIA A100-SXM...  Off  | 00000000:CB:00.0 Off |                    0 |
| N/A   26C    P0    51W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

Code Example¶

In this section, we show a basic code example. You can directly copy and paste the code to test it:

// filename : /tmp/sample.cu

#include <stdio.h>
#include <cuda_runtime.h>

#define CHECK(cmd) \
{\
    cudaError_t error  = cmd;\
    if (error != cudaSuccess) { \
        fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \
        exit(EXIT_FAILURE);\
          }\
}


/*
 * Square each element in the array A and write to array C.
 */
template <typename T>
__global__ void
vector_square(T *C_d, T *A_d, size_t N)
{
    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
    size_t stride = blockDim.x * gridDim.x ;

    for (size_t i=offset; i<N; i+=stride) {
        C_d[i] = A_d[i] * A_d[i];
    }
}


int main(int argc, char *argv[])
{
    float *A_d, *C_d;
    float *A_h, *C_h;
    size_t N = 1000000;
    size_t Nbytes = N * sizeof(float);

    cudaDeviceProp props;
    CHECK(cudaGetDeviceProperties(&props, 0/*deviceID*/));
    printf ("info: running on device %s\n", props.name);

    printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
    A_h = (float*)malloc(Nbytes);
    CHECK(A_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess );
    C_h = (float*)malloc(Nbytes);
    CHECK(C_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess );
    // Fill with Phi + i
    for (size_t i=0; i<N; i++)
    {
        A_h[i] = 1.618f + i;
    }

    printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
    CHECK(cudaMalloc(&A_d, Nbytes));
    CHECK(cudaMalloc(&C_d, Nbytes));


    printf ("info: copy Host2Device\n");
    CHECK ( cudaMemcpy(A_d, A_h, Nbytes, cudaMemcpyHostToDevice));

    const unsigned blocks = 512;
    const unsigned threadsPerBlock = 256;

    printf ("info: launch 'vector_square' kernel\n");
    vector_square <<<blocks, threadsPerBlock>>> (C_d, A_d, N);

    printf ("info: copy Device2Host\n");
    CHECK ( cudaMemcpy(C_h, C_d, Nbytes, cudaMemcpyDeviceToHost));

    printf ("info: check result\n");
    for (size_t i=0; i<N; i++)  {
        if (C_h[i] != A_h[i] * A_h[i]) {
            CHECK(cudaErrorUnknown);
        }
    }
    printf ("PASSED!\n");
}

First convert the CUDA sample code into HIP code:

cd /tmp
/opt/rocm/hip/bin/hipify-perl sample.cu > sample.cpp

This code can then be compiled using the following commands:

cd /tmp
export HIP_PLATFORM=$( /opt/rocm/hip/bin/hipconfig --platform )
export HIPCC=/opt/rocm/hip/bin/hipcc
$HIPCC sample.cpp -o sample

Running it, you should get the following output:

Singularity> cd /tmp
Singularity> ./sample
info: running on device NVIDIA A100-SXM4-40GB
info: allocate host mem (  7.63 MB)
info: allocate device mem (  7.63 MB)
info: copy Host2Device
info: launch 'vector_square' kernel
info: copy Device2Host
info: check result
PASSED!