# Record the R version, platform, and attached/loaded packages so the
# results below (timings, device names) can be reproduced or compared.
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_4.1.2  magrittr_2.0.2  fastmap_1.1.0   cli_3.2.0       tools_4.1.2     htmltools_0.5.2 rstudioapi_0.13
##  [8] yaml_2.2.2      jquerylib_0.1.4 stringi_1.7.6   rmarkdown_2.11  knitr_1.37      stringr_1.4.0   xfun_0.29      
## [15] digest_0.6.29   rlang_1.0.1     evaluate_0.14

General-purpose computing on GPU (GPGPU)

GPGPU is the use of a graphics processing unit (GPU), which typically handles computation only for computer graphics, to perform computation in applications traditionally handled by the CPU.

GPGPU is fundamentally a software concept, not a hardware concept; it is a type of algorithm, not a piece of equipment.

GPUs are often conveniently available on standard laptops and desktop computers and can be externally connected to a personal computer.

GPU architecture and memory hierarchy

A GPU consists of multiprocessor elements that run under the shared-memory thread model. GPUs can run hundreds or thousands of threads in parallel.

  • Local memory
    • Each thread has private local memory.
  • Shared memory
    • Each thread block has shared memory visible to all threads of the block.
  • Global memory
    • All threads have access to the same global memory.

GPGPU platform

  • CUDA
    • CUDA is a software development kit (SDK) and application programming interface (API) that allows using the programming language C/C++ to code algorithms for execution on NVIDIA GPUs.
    • There are extensive CUDA libraries for scientific computing: CUB, Thrust, CuBLAS, CuRAND, CuSparse, ...
  • OpenCL
    • OpenCL is an open-standard GPGPU framework that can execute programs on GPUs from many manufacturers including NVIDIA and AMD, but lacks some libraries essential for statistical computing.

Heterogeneous programming

  • Basic concepts
    • Host: The CPU and its memory (host memory)
    • Device: The GPU and its memory (device memory)
    • Kernel: Parallel code that is launched and executed on a device by many threads, as opposed to only one thread as in a regular function
  • Workflow
    • Write code for the device (kernel)
    • Write code for the host to launch the kernel
    • Pass runtime and algorithmic parameters and launch the kernel
    • Manage memory transactions between host and device

GPGPU in R

Examples

# Simulation setup: two independent standard-normal vectors of length one
# million, used as inputs for every vector-addition implementation below.
# (Style fix: use `<-` rather than `=` for top-level assignment.)
n <- 10^6
x1 <- rnorm(n)
x2 <- rnorm(n)

Add two vectors

# Element-wise addition of two equal-length numeric vectors using an
# explicit loop (deliberately non-vectorized, as the pure-R CPU baseline
# for the benchmarks below).
#
# Bug fix: the original `rep(length(x1), 0.0)` has its arguments swapped --
# it repeats `length(x1)` zero times, yielding a ZERO-length vector, so
# `out` was silently grown one element at a time (O(n^2) copying) instead
# of being preallocated. Preallocate with numeric(length(x1)) instead.
# Also use seq_along() so zero-length input returns numeric(0) rather than
# iterating over the bogus sequence 1:0.
add_r <- function(x1, x2) {
  out <- numeric(length(x1))
  for (i in seq_along(x1)) {
    out[i] <- x1[i] + x2[i]
  }
  out
}
add_r
## function(x1, x2) {
##   out <- rep(length(x1), 0.0)
##   for (i in 1:length(x1)) {
##     out[i] <- x1[i] + x2[i]
##   }
##   return(out)
## }
# Byte-compile add_r with the base `compiler` package; cmpfun() returns a
# compiled closure with the same interface as add_r. (The benchmark below
# shows essentially no speedup over add_r here.)
library(compiler)
add_rc <- cmpfun(add_r)
# C++ version of the same element-wise addition via Rcpp. cppFunction()
# compiles the C++ source string at run time and binds the result as the
# R function add_c(x1, x2). NumericVector wraps R's double vectors with
# no copy of the interface: same inputs and output as add_r above.
library(Rcpp)
cppFunction('NumericVector add_c(NumericVector x1, NumericVector x2) {
  int n = x1.size();
  NumericVector out (n);
  for(int i = 0; i < n; i++) {
    out[i] = x1[i] + x2[i];
  }
  return out;
}')
add_c
## function (x1, x2) 
## .Call(<pointer: 0x10d0a13e0>, x1, x2)
# Set up the OpenCL package: pick the first available platform, the second
# device on that platform (per the output below, an integrated Intel GPU),
# and create a compute context for it.
# NOTE(review): the device index [[2]] is machine-specific -- verify the
# platform/device indices when running on other hardware.
library(OpenCL)
platform <- oclPlatforms()[[1]] # retrieve available OpenCL platforms
device <- oclDevices(platform)[[2]] # get OpenCL devices for the given platform
ctx <- oclContext(device) # create an OpenCL context for a given device
platform
## OpenCL platform 'Apple'
device
## OpenCL device 'Intel(R) Iris(TM) Plus Graphics 655'
ctx
## OpenCL context <pointer: 0x7fc9b3c61980>
##   Device: OpenCL device 'Intel(R) Iris(TM) Plus Graphics 655'
##   Queue: OpenCL command queue <pointer: 0x7fc9b3ca1690>
##   Default precision: single
# define kernel
# OpenCL C source for the element-wise addition kernel, stored as an R
# string and compiled at run time by oclSimpleKernel() below (it is not R
# code). Each work item (thread) reads its global ID and adds one pair of
# elements; the `i < n` guard keeps threads launched beyond the vector
# length from writing out of bounds.
code <- c("
   __kernel void kernel_add(
     __global float* output,
     const int n,
     __global const float* input1,
     __global const float* input2
   )
   {
      // get global thread ID  
      int i = get_global_id(0);

      // make sure we do not go out of bounds
      if (i<n) output[i] = input1[i] + input2[i];
   };
")

# Compile the OpenCL kernel source within the context; "single" selects
# single-precision arithmetic, matching the kernel's `float*` arguments
# (and the context's default precision shown above).
kernel_add <- oclSimpleKernel(ctx, "kernel_add", code, "single")

# Allocate single-precision buffers in device (GPU) memory, sized to match
# the host vectors; they are filled by add_gpu_copy() below.
d_x1 <- clBuffer(ctx, length(x1), "single")
d_x2 <- clBuffer(ctx, length(x2), "single")

# copy data from host (CPU) to device (GPU)
# Assigning into d_x[] writes the host vector h_x into the existing device
# buffer. NOTE(review): this relies on clBuffer objects having reference
# semantics (the caller's buffer is updated even though d_x is a function
# argument) -- the later benchmark use of d_x1/d_x2 is consistent with
# that, but confirm against the OpenCL package documentation.
add_gpu_copy <- function(d_x, h_x) {
  d_x[] <- h_x
}
add_gpu_copy(d_x1, x1)
add_gpu_copy(d_x2, x2)

# run add kernel using OpenCL
# Launch kernel_add over length(d_x1) work items; oclRun() returns a new
# device buffer holding the element-wise sums of d_x1 and d_x2. Inputs
# must already reside on the device (see add_gpu_copy above).
add_gpu <- function(d_x1, d_x2) {
  oclRun(kernel_add, length(d_x1), d_x1, d_x2)
}
# Benchmark every implementation: pure R loop, byte-compiled R loop,
# Rcpp/C++ loop, vectorized base R, GPU kernel launch, and the
# host-to-device copy.
# NOTE(review): add_gpu() times only the kernel launch/run on data already
# resident in device memory; the host-to-device transfer cost is measured
# separately by add_gpu_copy(), so the two GPU rows should be read together.
library(microbenchmark)
microbenchmark(add_r(x1, x2), add_rc(x1, x2), add_c(x1, x2), x1+x2,
               add_gpu(d_x1, d_x2), add_gpu_copy(d_x1, x1))
## Unit: microseconds
##                    expr        min          lq        mean     median         uq        max neval
##           add_r(x1, x2) 308052.861 326307.1795 341374.9604 338174.843 353688.306 405145.508   100
##          add_rc(x1, x2) 313752.376 327797.1000 344318.2126 340572.758 356251.533 404405.891   100
##           add_c(x1, x2)   1637.994   4282.6765   6154.1836   4527.628   5484.640  48969.388   100
##                 x1 + x2   1200.933   3794.7740   5273.5301   4084.570   4810.689  52635.951   100
##     add_gpu(d_x1, d_x2)     16.730     80.3135    123.5025     91.602    104.036   2998.559   100
##  add_gpu_copy(d_x1, x1)   2469.355   2870.3145   3646.5643   3192.154   4069.506   7848.140   100

Tree-based parallel algorithms

Reduction

Reductions convert an array of elements into a single result:

\[[a_0, a_1, ..., a_{n-1}] \rightarrow\sum_{i=0}^{n-1} a_i.\]

Reductions are useful for implementing log-likelihood calculations, since independent samples contribute additively to the model log-likelihood: \[ l(\beta) = \sum_i l_i(\beta).\]

Scan

Scans take an array and return an array:

\[[a_0, a_1, ..., a_{n-1}] \rightarrow [a_0, a_0 + a_1, ..., \sum_{i=0}^{n-1} a_i].\]

Use CUDA/OpenCL libraries
cat OpenclReduce.cpp
## #include <Rcpp.h>
## #include <boost/compute.hpp>
## 
## using namespace Rcpp;
## 
## namespace compute = boost::compute;
## 
## // [[Rcpp::export]]
## float OpenclReduce(std::vector<float>& h_x) {
##     
##     // get the default compute device
##     compute::device device = compute::system::default_device();
##     
##     // create a compute context and command queue
##     compute::context context(device);
##     compute::command_queue queue(context, device);
##     
##     // allocate memory on the device
##     compute::vector<float> d_x(h_x.size(), context);
##     
##     // copy data from the host to the device
##     compute::copy(h_x.begin(), h_x.end(), d_x.begin(), queue);
##     
##     // calculate the sum and save the result back to the host
##     float total;
##     compute::reduce(d_x.begin(), d_x.end(), &total, queue);
##     
##     return total;
## }

Tips of using GPGPU in statistical computing

Learning resources