Using IBM Power Partition¶
For testing your application on the IBM Power partition, you need to prepare a job script for that partition (a sketch is given after the option list below) or use an interactive job:
salloc -N 1 -c 192 -A PROJECT-ID -p p07-power --time=08:00:00
where:

- -N 1 allocates a single node,
- -c 192 allocates 192 cores (threads),
- -p p07-power selects the IBM Power partition,
- --time=08:00:00 requests the allocation for 8 hours.
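For non-interactive runs, a batch job script for the same partition might look like the following sketch (the job name and the executable ./hello are placeholders; adjust the module list and the run command to your application):
#!/bin/bash
#SBATCH --job-name=power-test
#SBATCH --account=PROJECT-ID
#SBATCH --partition=p07-power
#SBATCH --nodes=1
#SBATCH --cpus-per-task=192
#SBATCH --time=08:00:00

# reload the module list for the Power architecture and load the toolchain
ml architecture/ppc64le
ml GCC OpenBLAS

# run the application
./hello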
On the partition, you should reload the list of modules:
ml architecture/ppc64le
The platform offers both GNU-based and proprietary IBM toolchains for building applications. IBM also provides an optimized BLAS routines library (ESSL), which can be used with both toolchains.
Building Applications¶
Our sample application depends on BLAS, so we start by loading the following modules (regardless of which toolchain we want to use):
ml GCC OpenBLAS
GCC Toolchain¶
In the case of the GCC toolchain, we can go ahead and compile the application using either g++
g++ -lopenblas hello.cpp -o hello
or gfortran
gfortran -lopenblas hello.f90 -o hello
as usual.
IBM Toolchain¶
The IBM toolchain requires additional environment setup, as it is installed in /opt/ibm and is not exposed as a module:
IBM_ROOT=/opt/ibm
OPENXLC_ROOT=$IBM_ROOT/openxlC/17.1.1
OPENXLF_ROOT=$IBM_ROOT/openxlf/17.1.1
export PATH=$OPENXLC_ROOT/bin:$PATH
export LD_LIBRARY_PATH=$OPENXLC_ROOT/lib:$LD_LIBRARY_PATH
export PATH=$OPENXLF_ROOT/bin:$PATH
export LD_LIBRARY_PATH=$OPENXLF_ROOT/lib:$LD_LIBRARY_PATH
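With the variables exported, you can quickly verify that the compilers are picked up from /opt/ibm, for example:
which ibm-clang++ xlf
ibm-clang++ --version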
From there, we can use either ibm-clang++
ibm-clang++ -lopenblas hello.cpp -o hello
or xlf
xlf -lopenblas hello.f90 -o hello
to build the application as usual.
Note
The combination of xlf and openblas seems to cause severe performance degradation. Therefore, the ESSL library should be preferred (see below).
Using ESSL Library¶
The ESSL library is installed in /opt/ibm/math/essl/7.1, so we define additional environment variables:
IBM_ROOT=/opt/ibm
ESSL_ROOT=${IBM_ROOT}/math/essl/7.1
export LD_LIBRARY_PATH=$ESSL_ROOT/lib64:$LD_LIBRARY_PATH
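To check that the library is where we expect it, you can, for example, list it:
ls ${ESSL_ROOT}/lib64/libessl.so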
The simplest way to utilize ESSL in an application that already uses BLAS or CBLAS routines is to link against the provided libessl.so. This can be done by replacing -lopenblas with -lessl, or with -lessl -lopenblas in case ESSL does not provide all the required BLAS routines.
In practice this can look like
g++ -L${ESSL_ROOT}/lib64 -lessl -lopenblas hello.cpp -o hello
or
gfortran -L${ESSL_ROOT}/lib64 -lessl -lopenblas hello.f90 -o hello
and similarly for the IBM compilers (ibm-clang++ and xlf), as sketched below.
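For illustration, the corresponding IBM toolchain link lines might look as follows (a sketch mirroring the GCC examples above; adjust the flags to your setup, and note that openblas is omitted for xlf because of the performance issue mentioned earlier):
ibm-clang++ -L${ESSL_ROOT}/lib64 -lessl -lopenblas hello.cpp -o hello
xlf -L${ESSL_ROOT}/lib64 -lessl hello.f90 -o hello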
Hello World Applications¶
The hello world example application (written in C++ and Fortran) uses a simple stationary probability vector estimation to illustrate the use of GEMM (a BLAS level 3 routine). The transition matrix is repeatedly multiplied by itself with GEMM, and the diagonal of the resulting matrix power is printed as an estimate of the stationary probability vector.
Stationary probability vector estimation in C++:
#include <iostream>
#include <vector>
#include <algorithm>
#include <chrono>

#include "cblas.h"

const size_t ITERATIONS = 32;
const size_t MATRIX_SIZE = 1024;

int main(int argc, char *argv[])
{
    const size_t matrixElements = MATRIX_SIZE*MATRIX_SIZE;

    // Row-stochastic transition matrix: uniform rows, except the first row.
    std::vector<float> a(matrixElements, 1.0f / float(MATRIX_SIZE));
    for(size_t i = 0; i < MATRIX_SIZE; ++i)
        a[i] = 0.5f / (float(MATRIX_SIZE) - 1.0f);
    a[0] = 0.5f;

    std::vector<float> w1(matrixElements, 0.0f);
    std::vector<float> w2(matrixElements, 0.0f);

    std::copy(a.begin(), a.end(), w1.begin());

    std::vector<float> *t1, *t2;
    t1 = &w1;
    t2 = &w2;

    auto c1 = std::chrono::steady_clock::now();

    // Repeatedly multiply the current estimate by the transition matrix (t2 = t1 * a).
    for(size_t i = 0; i < ITERATIONS; ++i)
    {
        std::fill(t2->begin(), t2->end(), 0.0f);

        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
                    1.0f, t1->data(), MATRIX_SIZE,
                    a.data(), MATRIX_SIZE,
                    1.0f, t2->data(), MATRIX_SIZE);

        std::swap(t1, t2);
    }

    auto c2 = std::chrono::steady_clock::now();

    // Print the diagonal of the powered matrix (the stationary probability estimate).
    for(size_t i = 0; i < MATRIX_SIZE; ++i)
    {
        std::cout << (*t1)[i*MATRIX_SIZE + i] << " ";
    }
    std::cout << std::endl;

    std::cout << "Elapsed Time: " << std::chrono::duration<double>(c2 - c1).count() << std::endl;

    return 0;
}
Stationary probability vector estimation in Fortran:
program main
    implicit none

    integer :: matrix_size, iterations
    integer :: i
    real, allocatable, target :: a(:,:), w1(:,:), w2(:,:)
    real, dimension(:,:), contiguous, pointer :: t1, t2, tmp
    real, pointer :: out_data(:), out_diag(:)
    integer :: cr, cm, c1, c2

    iterations = 32
    matrix_size = 1024

    call system_clock(count_rate=cr)
    call system_clock(count_max=cm)

    allocate(a(matrix_size, matrix_size))
    allocate(w1(matrix_size, matrix_size))
    allocate(w2(matrix_size, matrix_size))

    ! Column-stochastic transition matrix: uniform columns, except the first column.
    a(:,:) = 1.0 / real(matrix_size)
    a(:,1) = 0.5 / real(matrix_size - 1)
    a(1,1) = 0.5

    w1 = a
    w2(:,:) = 0.0

    t1 => w1
    t2 => w2

    call system_clock(c1)

    ! Repeatedly multiply the current estimate by the transition matrix (t2 = t1 * a).
    do i = 1, iterations
        t2(:,:) = 0.0
        call sgemm('N', 'N', matrix_size, matrix_size, matrix_size, 1.0, t1, matrix_size, a, matrix_size, 1.0, t2, matrix_size)
        tmp => t1
        t1 => t2
        t2 => tmp
    end do

    call system_clock(c2)

    ! Print the diagonal of the powered matrix (the stationary probability estimate).
    out_data(1:size(t1)) => t1
    out_diag => out_data(1::matrix_size+1)
    print *, out_diag
    print *, "Elapsed Time: ", (c2 - c1) / real(cr)

    deallocate(a)
    deallocate(w1)
    deallocate(w2)
end program main
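Either variant can then be executed directly after building with one of the compiler invocations shown above. When the binary is linked against OpenBLAS, the number of threads used by the BLAS routines can be limited through the OPENBLAS_NUM_THREADS environment variable, for example:
export OPENBLAS_NUM_THREADS=32
./hello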