Connecting R with C++

Refs:

library(Rcpp)

Inline Functions

cppFunction allows you to write inline C++ functions in R:

cppFunction('

  int add(int x, int y, int z) {
    int sum = x + y + z;
    return sum;
  }

')

add(1,2,3)
## [1] 6

Scalars & Vectors

Some recognized types:

  • Scalars of type double, int, String and bool

  • Vectors of type NumericVector, CharacterVector, IntegerVector and LogicalVector

Here’s an eg of making a vector sum in Rcpp

cppFunction('

  int sumCpp(NumericVector x) {

    int i, sum = 0, n = x.size();

    for(i=0; i<n; i++){    // note: vectors indices start at 0
      sum += x[i];
    }
    return sum;
  }

')

sumR <- function(x) { # the equivalent R function
  sum <- 0
  n = length(x)
  for(i in 1:n) {
    sum <- sum + x[i]
  }
  sum
}

library(microbenchmark)  # let's compare performances
## Warning: package 'microbenchmark' was built under R version 3.1.1
m <- microbenchmark(
  sumR(1:10000),
  sumCpp(1:10000),
  sum(1:10000)      # sum is already C++ optimized
)
m
## Unit: microseconds
##             expr     min      lq    mean  median      uq      max neval
##    sumR(1:10000) 7530.31 7718.04 8615.49 7821.51 9013.41 45363.16   100
##  sumCpp(1:10000)  104.96  114.56  129.64  126.72  138.24   206.51   100
##     sum(1:10000)   24.75   28.59   33.26   31.15   35.63    74.67   100

Matrices

There is also the type for matrices:

  • NumericMatrix, IntegerMatrix, CharacterMatrix and LogicalMatrix

An eg that reproduces rowSums():

cppFunction('

  NumericVector rowSumsC(NumericMatrix x) {
    int nrow = x.nrow(), 
        ncol = x.ncol();
    NumericVector out(nrow);
  
    for (int i = 0; i < nrow; i++) {
      double total = 0;
      for (int j = 0; j < ncol; j++) {
        total += x(i, j);
      }
      out[i] = total;
    }
    return out;
  }

')

x <- matrix(sample(100), 10)
rowSums(x)
##  [1] 614 651 483 510 548 374 471 602 355 442
rowSumsC(x)
##  [1] 614 651 483 510 548 374 471 602 355 442

Using Function, Lists and DataFrames

cppFunction('

  RObject callWithOne(Function f) {
    return f(1);
  }

')

callWithOne(function(x) x*10+1)
## [1] 11
callWithOne(paste)
## [1] "1"
callWithOne(plot)

plot of chunk unnamed-chunk-5

## NULL
cppFunction('

  List lapplyCpp(List input, Function f) {
    int n = input.size();
    List out(n);
  
    for(int i = 0; i < n; i++) {
      out[i] = f(input[i]);
    }

    return out;
  }

')

lapplyCpp(1:3, function(x) 2*x)
## [[1]]
## [1] 2
## 
## [[2]]
## [1] 4
## 
## [[3]]
## [1] 6
cppFunction('

  DataFrame makeDF(NumericVector a, NumericVector b, String d) {
    int n = a.size();
    NumericVector c(n);  

    for(int i = 0; i < n; i++) {
      c[i] = a[i] + b[i];
    }

    return DataFrame::create(_["col1"]=a, _["col2"]=b, _[d]=c);
  }

')

makeDF(1:10,2:11,"result")
##    col1 col2 result
## 1     1    2      3
## 2     2    3      5
## 3     3    4      7
## 4     4    5      9
## 5     5    6     11
## 6     6    7     13
## 7     7    8     15
## 8     8    9     17
## 9     9   10     19
## 10   10   11     21
cppFunction('

  DataFrame makeDF(NumericVector a, NumericVector b, Function f) {
    int n = a.size();
    NumericVector c(n);  

    for(int i = 0; i < n; i++) {
      c[i] = as<double>(f(_["a"] = a[i], 
                          _["b"] = b[i]));
    }

    return DataFrame::create(_["col1"]=a, _["col2"]=b, _["result"]=c);
  }

')

makeDF(1:10,2:11,function(a,b) a*b)
##    col1 col2 result
## 1     1    2      2
## 2     2    3      6
## 3     3    4     12
## 4     4    5     20
## 5     5    6     30
## 6     6    7     42
## 7     7    8     56
## 8     8    9     72
## 9     9   10     90
## 10   10   11    110
makeDF(1:5, 2:6, function(a,b) a^b)
##   col1 col2 result
## 1    1    2      1
## 2    2    3      8
## 3    3    4     81
## 4    4    5   1024
## 5    5    6  15625
cppFunction('
  DataFrame modifyDataFrame(DataFrame df) {
    
    // access the columns
    Rcpp::IntegerVector   a = df["a"];
    Rcpp::CharacterVector b = df["b"];
    
    // make some changes
    a[2] = 42;
    b[1] = "foo";       
    
    // return a new data frame
    return DataFrame::create(_["a1"]= a, _["b1"]= b);
  }
')

df <- data.frame(a = c(1, 2, 3),
                 b = c("x", "y", "z"))

modifyDataFrame(df)
##   a1  b1
## 1  1   x
## 2  2 foo
## 3 42   z

Missing Values

cppFunction('
            
  List scalar_missings() {
    int int_s = NA_INTEGER;
    String chr_s = NA_STRING;
    bool lgl_s = NA_LOGICAL;
    double num_s = NA_REAL;
    NumericVector x = NumericVector::create(NA_REAL);
  
    return List::create(int_s, chr_s, lgl_s, num_s, x);
  }

')

str(scalar_missings())
## List of 5
##  $ : int NA
##  $ : chr NA
##  $ : logi TRUE
##  $ : num NA
##  $ : num NA

But there are a lot of gotchas. Check here.

To check if a value in a vector is missing, use the class method ::is_na()

cppFunction('
            
  int countNAs(NumericVector x) {
    int i, count = 0, n = x.size();
  
    for(i=0; i<n; i++)
      count += NumericVector::is_na(x[i]);
  
    return count;
  }

')

countNAs(c(1,2,NA,NA,4,NA))
## [1] 3

Rcpp Sugar

All the basic arithmetic and logical operators are vectorised: + *, -, /, pow, <, <=, >, >=, ==, !=, !.

cppFunction('
           
  NumericVector pdistC(double x, NumericVector ys) {
  // computes the Euclidean distance between a value and a vector of values          
    return sqrt(pow((x - ys), 2));
  }

')

pdistC(2, c(1.1,2,10))
## [1] 0.9 0.0 8.0
  • The functions any and all are fully lazy.

  • A number of helpful functions provide a “view” of a vector: head(), tail(), rep_each(), rep_len(), rev(), seq_along(), and seq_len(). In R these would all produce copies of the vector, but in Rcpp they simply point to the existing vector and override the subsetting operator ([) to implement special behaviour. This makes them very efficient: for instance, rep_len(x, 1e6) does not have to make a million copies of x.

  • Math functions: abs(), acos(), asin(), atan(), beta(), ceil(), ceiling(), choose(), cos(), cosh(), digamma(), exp(), expm1(), factorial(), floor(), gamma(), lbeta(), lchoose(), lfactorial(), lgamma(), log(), log10(), log1p(), pentagamma(), psigamma(), round(), signif(), sin(), sinh(), sqrt(), tan(), tanh(), tetragamma(), trigamma(), trunc().

  • Scalar summaries: mean(), min(), max(), sum(), sd(), and (for vectors) var().

  • Vector summaries: cumsum(), diff(), pmin(), and pmax().

  • Finding values: match(), self_match(), which_max(), which_min().

  • Dealing with duplicates: duplicated(), unique().

  • d/q/p/r for all standard distributions.

  • noNA(x) asserts that the vector x does not contain any missing values, and allows optimisation of some mathematical operations.

Using C++ STL

It’s possible to use C++’s standard template library which includes lots of data structures and algorithms highly optimized.

Iterators

Iterators abstract the methods for looping over a data structure. There are three operators:

  • Advance with ++.

  • Get the value they refer to, or dereference, with *.

  • Compare with == or !=.

An eg of summing a vector with an iterator:

cppFunction('
           
  double sumIt(NumericVector x) {
    double total = 0;
    
    NumericVector::iterator it;
    for(it = x.begin(); it != x.end(); ++it) {
      total += *it;
    }
    return total;
  }

')

sumIt(c(1:100))
## [1] 5050

A more complex eg:

# given a vector of non-decreasing breakpoints in vec, find the interval containing each element of x
findInterval(x=c(11,-1,51,14,22,31,6), vec=c(0,10,20,30))
## [1] 2 0 4 2 3 4 1
cppFunction('
           
  IntegerVector findIntervalCpp(NumericVector x, NumericVector breaks) {
    IntegerVector out(x.size());
  
    NumericVector::iterator it, pos;
    IntegerVector::iterator out_it;
  
    // step through two iterators (input it and output out) simultaneously
    for(it = x.begin(), out_it = out.begin(); it != x.end(); ++it, ++out_it) {
      pos = std::upper_bound(breaks.begin(), breaks.end(), *it); // upper_bound() returns an iterator
      *out_it = std::distance(breaks.begin(), pos);
    }
  
    return out;
  }

')

findIntervalCpp(c(11,-1,51,14,22,31,6), c(0,10,20,30))
## [1] 2 0 4 2 3 4 1

There are many algorithms available. In non-inline codes, it’s needed to include #include <algorithm> at the top of the cpp file.

Also check the standard containers of STL.

Non-Inline Functions

The previous code is good for inline functions, but for more complex it’s better to write cpp files and then read and compile them at R

cpp.code = "

  #include <cstdlib>
  #include <iostream>
  #include <Rcpp.h>
  #include <omp.h>
  
  using namespace std;
  // [[Rcpp::export]]
  Rcpp::NumericVector parad(Rcpp::NumericVector x, Rcpp::NumericVector y) {
  
    int i,n,max;
    n=x.size();
    Rcpp::NumericVector product(n);
    
    max=omp_get_max_threads();  // eg with parallel version
    omp_set_num_threads(max);
    
    #pragma omp parallel for
    for(i=0; i<n; i++){
      product[i] = x[i] / y[i];
    }
    
    return(product);
  }
"

write(cpp.code, "parad.cpp")  # assume the file was written elsewhere

# Setting terminal environment variables from  within R so that the compiler 
# compiles as "g++ .. -fopenmp ." which we need for the "omp.h" header.

Sys.setenv("PKG_CXXFLAGS"="-fopenmp")
Sys.setenv("PKG_LIBS"="-fopenmp")

sourceCpp("parad.cpp") # sourceCpp is a wrapper that takes care of everything. 

# after that one can immediately start using the parad() function.

a <- rnorm(1000,0,1)
b <- rnorm(1000,0,1)
c <- parad(a,b)
c