sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_4.1.2 magrittr_2.0.2 fastmap_1.1.0 cli_3.2.0 tools_4.1.2 htmltools_0.5.2 rstudioapi_0.13
## [8] yaml_2.2.2 jquerylib_0.1.4 stringi_1.7.6 rmarkdown_2.11 knitr_1.37 stringr_1.4.0 xfun_0.29
## [15] digest_0.6.29 rlang_1.0.1 evaluate_0.14
GPGPU is the use of a graphics processing unit (GPU), which typically handles computation only for computer graphics, to perform computation in applications traditionally handled by the CPU.
GPGPU is fundamentally a software concept, not a hardware concept; it is a type of algorithm, not a piece of equipment.
GPUs are conveniently available on most standard laptops and desktop computers, and external GPUs can also be attached to a personal computer.
A GPU consists of multiprocessor elements that run under a shared-memory thread model, and it can run hundreds or thousands of threads in parallel.
# simulate two input vectors of length 10^6
n <- 10^6
x1 <- rnorm(n)
x2 <- rnorm(n)
add_r <- function(x1, x2) {
  out <- rep(0.0, length(x1))   # pre-allocate the output vector
  for (i in 1:length(x1)) {
    out[i] <- x1[i] + x2[i]
  }
  return(out)
}
add_r
## function(x1, x2) {
##   out <- rep(0.0, length(x1))   # pre-allocate the output vector
##   for (i in 1:length(x1)) {
##     out[i] <- x1[i] + x2[i]
##   }
##   return(out)
## }
library(compiler)
add_rc <- cmpfun(add_r)
library(Rcpp)
cppFunction('NumericVector add_c(NumericVector x1, NumericVector x2) {
  int n = x1.size();
  NumericVector out(n);
  for (int i = 0; i < n; i++) {
    out[i] = x1[i] + x2[i];
  }
  return out;
}')
add_c
## function (x1, x2)
## .Call(<pointer: 0x10d0a13e0>, x1, x2)
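As a quick sanity check (not part of the benchmark below), the loop-based and C++ versions should agree with R's built-in vectorized addition:

all.equal(add_r(x1, x2), x1 + x2)   # should be TRUE
all.equal(add_c(x1, x2), x1 + x2)   # should be TRUE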
library(OpenCL)
platform <- oclPlatforms()[[1]]     # pick the first available OpenCL platform
device <- oclDevices(platform)[[2]] # pick the second device on that platform (the GPU here)
ctx <- oclContext(device)           # create an OpenCL context for the chosen device
platform
## OpenCL platform 'Apple'
device
## OpenCL device 'Intel(R) Iris(TM) Plus Graphics 655'
ctx
## OpenCL context <pointer: 0x7fc9b3c61980>
## Device: OpenCL device 'Intel(R) Iris(TM) Plus Graphics 655'
## Queue: OpenCL command queue <pointer: 0x7fc9b3ca1690>
## Default precision: single
# define kernel
code <- c("
__kernel void kernel_add(
    __global float* output,
    const int n,
    __global const float* input1,
    __global const float* input2)
{
    // get the global thread ID
    int i = get_global_id(0);
    // make sure we do not go out of bounds
    if (i < n) output[i] = input1[i] + input2[i];
}
")
# create and compile OpenCL kernel code
kernel_add <- oclSimpleKernel(ctx, "kernel_add", code, "single")
# create OpenCL buffers on the GPU (single precision, one element per input value)
d_x1 <- clBuffer(ctx, length(x1), "single")
d_x2 <- clBuffer(ctx, length(x2), "single")
# copy data from host (CPU) to device (GPU)
add_gpu_copy <- function(d_x, h_x) {
  d_x[] <- h_x
}
add_gpu_copy(d_x1, x1)
add_gpu_copy(d_x2, x2)
# run add kernel using OpenCL
add_gpu <- function(d_x1, d_x2) {
  result <- oclRun(kernel_add, length(d_x1), d_x1, d_x2)
  return(result)
}
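Using the sum on the host requires one more step that the benchmark below does not time: copying the result buffer back from the device. A minimal sketch, assuming the clBuffer returned by oclRun can be read back with [] indexing (the counterpart of the d_x[] <- h_x copy above):

# copy the result from device (GPU) back to host (CPU) and check it
d_out <- add_gpu(d_x1, d_x2)
h_out <- d_out[]
all.equal(h_out, x1 + x2, tolerance = 1e-6)   # single precision, so allow rounding error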
library(microbenchmark)
microbenchmark(add_r(x1, x2), add_rc(x1, x2), add_c(x1, x2), x1+x2,
add_gpu(d_x1, d_x2), add_gpu_copy(d_x1, x1))
## Unit: microseconds
##                    expr        min          lq        mean     median         uq        max neval
##           add_r(x1, x2) 308052.861 326307.1795 341374.9604 338174.843 353688.306 405145.508   100
##          add_rc(x1, x2) 313752.376 327797.1000 344318.2126 340572.758 356251.533 404405.891   100
##           add_c(x1, x2)   1637.994   4282.6765   6154.1836   4527.628   5484.640  48969.388   100
##                 x1 + x2   1200.933   3794.7740   5273.5301   4084.570   4810.689  52635.951   100
##     add_gpu(d_x1, d_x2)     16.730     80.3135    123.5025     91.602    104.036   2998.559   100
##  add_gpu_copy(d_x1, x1)   2469.355   2870.3145   3646.5643   3192.154   4069.506   7848.140   100
Reductions convert an array of elements into a single result:
\[[a_0, a_1, ..., a_{n-1}] \rightarrow\sum_{i=0}^{n-1} a_i.\]
Reductions are useful for implementing log-likelihood calculations, since independent samples contribute additively to the model log-likelihood: \[ l(\beta) = \sum_i l_i(\beta).\]
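For example, under an assumed N(0, 1) model the log-likelihood of the simulated x1 is just a sum-reduction over per-observation log-densities; a small CPU-side sketch of the pattern a GPU reduction would accelerate:

# each dnorm(..., log = TRUE) term is one l_i(beta); sum() performs the reduction
loglik <- sum(dnorm(x1, mean = 0, sd = 1, log = TRUE))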
Scans take an array and return an array of the same length containing the running (prefix) sums:
\[[a_0, a_1, ..., a_{n-1}] \rightarrow [a_0, a_0 + a_1, ..., \sum_{i=0}^{n-1} a_i].\]
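On the CPU, the inclusive scan above is what cumsum() computes:

# inclusive scan: element i holds the running sum a_0 + ... + a_i
cumsum(c(1, 2, 3, 4))   # 1 3 6 10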
cat OpenclReduce.cpp
## #include <Rcpp.h>
## #include <boost/compute.hpp>
##
## using namespace Rcpp;
##
## namespace compute = boost::compute;
##
## // [[Rcpp::export]]
## float OpenclReduce(std::vector<float>& h_x) {
##
## // get the default compute device
## compute::device device = compute::system::default_device();
##
## // create a compute context and command queue
## compute::context context(device);
## compute::command_queue queue(context, device);
##
## // allocate memory on the device
## compute::vector<float> d_x(h_x.size(), context);
##
## // copy data from the host to the device
## compute::copy(h_x.begin(), h_x.end(), d_x.begin(), queue);
##
## // calculate the sum and save the result back to the host
## float total;
## compute::reduce(d_x.begin(), d_x.end(), &total, queue);
##
## return total;
## }
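One way to build and call this from R might look like the following sketch; it assumes Rcpp::sourceCpp can find the Boost.Compute headers and link against OpenCL on your system:

Rcpp::sourceCpp("OpenclReduce.cpp")
OpenclReduce(x1)   # reduction on the GPU
sum(x1)            # CPU reference; expect small differences due to single precision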
Parallelization using GPUs generally performs better on very large sample sizes; on problems with a small sample size, the communication latency costs often outweigh the gains from the parallelized work (see the sketch after these notes).
When optimizing an existing program, start by off-loading the most computationally intensive routines to the GPU before rewriting the whole codebase.
Use GPU-accelerated libraries to get an immediate speed-up; we often don't need to write complicated algorithms ourselves.
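As a rough illustration of the latency point above, the same kernel_add and context from earlier can be benchmarked on a much smaller problem, where the host-to-device copy dominates (a sketch; n_small and the buffer names are illustrative):

n_small <- 10^3
y1 <- rnorm(n_small)
y2 <- rnorm(n_small)
d_y1 <- clBuffer(ctx, n_small, "single")
d_y2 <- clBuffer(ctx, n_small, "single")
d_y1[] <- y1
d_y2[] <- y2
microbenchmark(cpu = y1 + y2,
               gpu_kernel_only = oclRun(kernel_add, n_small, d_y1, d_y2),
               gpu_copy_and_kernel = {
                 d_y1[] <- y1
                 d_y2[] <- y2
                 oclRun(kernel_add, n_small, d_y1, d_y2)
               })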
CUDA GPU-Accelerated libraries: https://developer.nvidia.com/gpu-accelerated-libraries
A curated list of awesome OpenCL resources: https://github.com/jslee02/awesome-opencl
Introduction to Parallel Computing Tutorial: https://hpc.llnl.gov/documentation/tutorials/introduction-parallel-computing-tutorial
Accelerate R Applications with CUDA: https://developer.nvidia.com/blog/accelerate-r-applications-cuda/
CUDA C++ Programming Guide: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html