Efficient Programming in S-PLUS
By Tim Hesterberg, 7/2000, 12/02, 3/04, 9/06, 9/07

OUTLINE:
  Vectorize, vectorize, vectorize!
    avoid for loops
    avoid growing data sets
  Useful when vectorizing:
    rep()
    colSums(), rowSums() (and *Means, *Vars, ...)
    cumsum, cumprod, cummax, cummin
    ifelse
    x[some elements] <- value
    outer() (requires vectorized function)
    any(), all(), | and & (vectorized versions of || and &&)
    pmin(), pmax()
    cut(), table(), tabulate()
    match(), match.data.frame(), is.element(), approx()
    order, sort.list(..., partial), sort(..., partial)
    which(), "%%", call, do.call
    Read help files to learn these and other functions.
    generate random numbers many at a time
  When you can't vectorize:
    lapply() and apply()
    move calculations out of loops
  Use fast objects
    matrices and vectors, not data frames
    for data frames, use $ rather than [], [j] rather than [,j]
    pull vectors out of a larger object
    avoid names on vectors
    checking for missing values
  Use fast functions
    functions that use .Internal
    functions that are themselves vectorized
    write your functions to be vectorized (in all relevant arguments)
    recursion: avoid it, or use small argument lists
  Miscellaneous
    reuse calculations
    reuse objects
    avoid writing to frame 0 or 1 repeatedly
    save intermediate results only occasionally
    do as many calculations as possible on smaller objects
    use utilities for checking time and memory
  Code review
  When efficiency is a big issue:
    avoid function calls
    pull out the guts of functions you are calling
    call existing C code
    avoid error-checking in interior functions
    avoid memory growth, use For() loops
    call new C code
  Case Study
  Miscellaneous
    emacs
    numerically-accurate computing

-----------------------------------------------
FURTHER INFORMATION

Two chapters in the S-PLUS Programmer's Guide contain hints on efficient
programming.

S+ Programmer's Guide, "Using Less Time & Memory"
  Vectorized Arithmetic
  Avoid for loops
  Avoid growing data sets
  Avoid looping over named objects
  Keep it simple!
  Reuse computations
  Avoid Recursion

S+ Programmer's Guide, "Simulations in S-PLUS"
  For function
  lapply
  monitoring and recovery
  reduce number of calls that change .Random.seed
  hybrid of for and lapply
  listen to hard disk

Also see Venables & Ripley, "S Programming". (Below, V&R refers to their
"Modern Applied Statistics with S-PLUS"; I modify some of their examples
to improve speed. Note that their originals are written to demonstrate
various points, not to be as fast as possible.)

-----------------------------------------------
Vectorize, vectorize, vectorize!
* avoid growing data sets
* avoid for loops

x <- runif(1000)   # data used in the examples below
y <- runif(300)

# Bad:
z <- numeric(0)
n <- length(x)
for(i in 1:(n-1))
  z <- c(z, x[i] + x[i+1])
# 2.21 seconds

# Good:
z <- x[-length(x)] + x[-1]
# .01 seconds

----
* outer() (requires vectorized function)
* auto-replication
* rep(), rep(..., each)

# Bad:  (gave up after 622 seconds)
z <- matrix(as.double(NA), length(x), length(y))  # should initialize with NA
for(i in 1:length(x))
  for(j in 1:length(y))
    z[i,j] <- x[i] + y[j]

# Good:
z <- outer(x, y, "+")
# 2.24 seconds

# Better:
z <- rep(x, length(y)) + rep(y, each=length(x))
dim(z) <- c(length(x), length(y))
# 1.29 seconds

# Best:
z <- x + rep(y, each=length(x))  # take advantage of auto-replication of x
dim(z) <- c(length(x), length(y))
# .85 seconds

# Bit slower:
z <- structure(x + rep(y, each=length(x)),
               dim = c(length(x), length(y)))
# .96 seconds

# Comment - use structure() to create data and attach dim or other
# attributes in one shot. Particularly useful at the end of a function,
# to create the return value, e.g. compare these two:
function(x){
  ...
  result <- someCalculation
  dim(result) <- c(n,p)
  result
}
function(x){
  ...
  structure(someCalculation, dim=c(n,p))
}
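----
* generate random numbers many at a time

One item from the list above deserves a quick example: each call to a
random number generator has overhead (including updating .Random.seed),
so generate all the numbers you need in one call. A minimal sketch,
assuming a per-sample function f() of your own, defined in global memory:

# Bad: one call to runif() per iteration
res <- numeric(1000)
for(i in 1:1000)
  res[i] <- f(runif(10))

# Good: one call to runif(), then index into the result
u <- matrix(runif(10 * 1000), 10)
res <- sapply(1:1000, function(i, u) f(u[,i]), u = u)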
----
* Avoid defining unnecessary objects

# OK: (code inside an old version of outer())
# 22 seconds
a <- matrix(X, length(X), length(Y))
b <- matrix(Y, length(X), length(Y), byrow=T)
ans <- FUN(a, b, ...)

# Better: (I've checked this in)
# 13 seconds
ans <- FUN(rep(X, length(Y)), rep(Y, each=length(X)), ...)

# Comment - did not rely on auto-replication of X, because we don't know
# whether FUN auto-replicates its arguments.

----
* colSums(), rowSums() (and *Means, *Vars, *Stdevs, *Maxs, *Mins, *Ranges)

# Bad:
zmeans <- rep(NA, ncol(z))
for(j in 1:ncol(z))
  zmeans[j] <- mean(z[,j])

# OK:
zmeans <- rep(1/nrow(z), nrow(z)) %*% z
# but requires multiplication, not just addition

# Better:
zmeans <- apply(z, 2, mean)

# Best:
zmeans <- colMeans(z)

# Bad: V&R code for permutation sign test for paired comparisons.
# Data is d, assumed length 10; in practice it is the difference
# between paired observations.
d <- rnorm(10)
ttest <- function(x) mean(x)/sqrt(var(x)/length(x))
n <- 1000
res <- numeric(n)
for(i in 1:n) {
  x <- d*sign(runif(10)-0.5)
  res[i] <- ttest(x)
}
# 120 seconds

# Good:
signed.d <- sign(runif(10 * 1000) - .5) * d
dim(signed.d) <- c(10, 1000)
res <- colMeans(signed.d)/colStdevs(signed.d) * sqrt(length(d))
# .27 seconds

----
* cumsum, cumprod, cummax, cummin

# Bad:
cx <- x[1]
for(i in 2:length(x))
  cx[i] <- cx[i-1] + x[i]

# Good:
cx <- cumsum(x)
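----
* cut(), table(), tabulate()

The counting functions in the list above replace another common loop.
A minimal sketch, assuming g is a vector of group numbers in 1:10:

# Bad:
counts <- rep(0, 10)
for(i in seq(along = g))
  counts[g[i]] <- counts[g[i]] + 1

# Good:
counts <- tabulate(g, 10)   # second argument is the number of bins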
----
* ifelse
* x[some elements] <- value
* pmin(), pmax()

# Bad: (S+2000 version of dhyper)
dhyper <- function(q, m, n, k){
  # Some initial code omitted here; all arguments may be vector-valued
  p <- q
  dhyper2 <- function(q, m, n, k)
  {
    if(!is.finite(q)) p <- NA
    else if(q < k - n) p <- 0
    else if(q > m || q > k) p <- 0
    else if(k == m + n) p <- ifelse(q == m, 1, 0)
    else if(q == 0)
      p <- exp(lgamma(n + 1) + lgamma(m + n - k + 1) -
               lgamma(n - k + 1) - lgamma(m + n + 1))
    else if(q == k)
      p <- exp(lgamma(m + 1) + lgamma(m + n - k + 1) -
               lgamma(m - k + 1) - lgamma(m + n + 1))
    else p <- exp(lgamma(m + 1) + lgamma(n + 1) +
                  lgamma(m + n - k + 1) + lgamma(k + 1) -
                  (lgamma(m - q + 1) + lgamma(q + 1) +
                   lgamma(n - k + q + 1) + lgamma(k - q + 1) +
                   lgamma(m + n + 1)))
    p
  }
  for(i in seq(along = q))
    p[i] <- dhyper2(q[i], m[i], n[i], k[i])
  p
}

# Possible version, using ifelse():
#   ifelse(logical vector, result when T, result when F)
dhyper <- function(q, m, n, k){
  # Some initial code omitted here
  ifelse(!is.finite(q), NA,
  ifelse(q < k-n, 0,
  ifelse(q > m | q > k, 0,      # Note use of | instead of ||
  ifelse(q == 0,
         exp(lgamma(n + 1) + lgamma(m + n - k + 1) -
             lgamma(n - k + 1) - lgamma(m + n + 1)),
  ifelse(q == k,
         exp(lgamma(m + 1) + lgamma(m + n - k + 1) -
             lgamma(m - k + 1) - lgamma(m + n + 1)),
         exp(lgamma(m + 1) + lgamma(n + 1) +
             lgamma(m + n - k + 1) + lgamma(k + 1) -
             (lgamma(m - q + 1) + lgamma(q + 1) +
              lgamma(n - k + q + 1) + lgamma(k - q + 1) +
              lgamma(m + n + 1))))))))
}
# Don't need to define p; just return the result.
# This would be faster for long vectors, but slower for short ones,
# because everything is computed in every case.
# (This is a rather extreme example, in terms of how many cases there are.)
# Would produce error messages from lgamma when any of the arguments
# are <= 0.

# New version:
dhyper <- function(q, m, n, k){
  # Some initial code omitted here
  N <- n + m
  p <- q * 0
  p[!is.finite(q)] <- NA
  nonzero <- is.finite(q) & (q <= pmin(m, k)) & (q >= pmax(0, k-n))
  lchoose <- function(N, m, which)
    (lgamma(N[which]+1) - lgamma(m[which]+1) - lgamma((N-m)[which]+1))
  p[nonzero] <- exp(lchoose(m, q, nonzero) +
                    lchoose(n, k-q, nonzero) -
                    lchoose(N, k, nonzero))
  p
}
# Defined N = n+m, because we re-use it a few times (some not shown).
# Defined p <- q * 0; this
#   + gives p the same dimensions as q
#   + sets all values to zero, so we only need to deal with nonzero cases.
# Use pmin() and pmax().
# p[some cases] <- value (for those cases).
# lgamma() is fast; don't worry about all those extra cases intended to
#   avoid calling it so many times.
# Not shown: make sure that all of q, m, n, k are the same length.
#   Don't rely on auto-replication; it is wrong if e.g. lengths are 3, 4, 12.
# Old version: 226 seconds
# New version: 6.7 seconds

------------------------------------------------
* When you must loop, use lapply(), sapply(), or apply(), not for().
  (Caveat - the speed difference is smaller in S+6 than it was in S+2000.)

# Bad:
result <- rep(NA, 1000)
for(i in 1:1000)
  result[i] <- sum(sample(18, 9))

# Good:
result <- sapply(1:1000, function(i) sum(sample(18,9)))

# Faster, but uglier:
result <- unlist(lapply(1:1000, function(i) sum(sample(18,9))))

# Bad:
result <- z
for(j in 1:ncol(z))
  result[,j] <- cumsum(z[,j])

# Good:
result <- apply(z, 2, cumsum)

# Comment - lapply() is "officially" used to process elements of a list,
#   lapply(a list, FUN, ...)
# But it is much more general than that.
# The first argument need not be a list; it may be a vector of indices,
# which may even be ignored (as in the previous example).

# Can loop over the second argument to a function if you give the first
# argument by name:
lapply(0:4/10, mean, x=x)   # pass different trim arguments

# Comment - apply() and sapply() are front-ends to lapply().
# They are slightly slower, but produce cleaner code.

------------
# Can loop over two objects, e.g. operate on columns of X and Y
result <- rep(NA, ncol(X))
for(i in 1:ncol(X)){
  result[i] <- g(X[,i], Y[,i])
}
# Or:
sapply(1:ncol(X), function(i) g(X[,i], Y[,i]))
# That assumes that g, X, and Y are defined in global memory. Otherwise:
sapply(1:ncol(X), function(i, g, X, Y) g(X[,i], Y[,i]), g = g, X = X, Y = Y)

# Can use sapply to vectorize a non-vectorized function (though still slow)
f <- function(right, left=0) {
  integrate(function(u) u^2, left, right)$integral
}
f(5)
f(5:6)        # fails, f is not vectorized
sapply(5:6, f)
sapply(1:2, function(i, x, y) f(x[i], y[i]), x=5:6, y=0:1)  # for two arguments

# General syntax:
#   sapply(x, f)                                            # for one argument
#   sapply(1:n, function(i, x, f) f(x[i]), x=x, f=f)        # another way
#   sapply(1:n, function(i, x, y, f) f(x[i], y[i]), x=x, y=y, f=f)  # 2 args

------------
* Move calculations outside of loops.

Calculations that do not depend on the loop index should be computed
once, before the loop, rather than repeated on every iteration; this
applies equally in S and in C.
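A minimal sketch of the idea in S, assuming n <- length(x) and
z <- numeric(n) (the loop itself is contrived -- it should really be
fully vectorized):

# Bad: mean(x) is recomputed on every iteration
for(i in 1:n)
  z[i] <- (x[i] - mean(x))^2

# Good: compute the loop-invariant quantity once
mx <- mean(x)
for(i in 1:n)
  z[i] <- (x[i] - mx)^2

# Best, of course: z <- (x - mean(x))^2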
------------------------------------------------
Use fast objects
    matrices and vectors, not data frames
    for data frames, use $ rather than [], [j] rather than [,j]
    pull vectors out of a larger object
    avoid names on vectors
    checking for missing values

------------------------------------------------
Use fast functions
    functions that use .Internal
    functions that are themselves vectorized
    write your functions to be vectorized (in all relevant arguments)
    recursion: avoid it, or use small argument lists

There can be a marked difference in the speed of functions. Typically
functions that use .Internal calls are fast. Here are versions of sum()
and mean() from an old version of S+; sum() was much faster (mean() has
since been improved).

> sum
function(x, ..., na.rm = F)
  .Internal(sum(x, ..., na.rm = na.rm), "do_summary", T, 116)

> mean
function(x, trim = 0, na.rm = F, weights = NULL)
{
  if(na.rm) {
    wnas <- which.na(x)
    if(length(weights))
      wnas <- c(wnas, which.na(weights))
    if(length(wnas)) {
      x <- x[ - wnas]
      if(length(weights))
        weights <- weights[ - wnas]
    }
  }
  if(length(weights)) {
    if(length(weights) != length(x))
      stop("x and weights must have the same length")
    if(trim)
      stop("trimming not allowed with weights")
    return(sum(weights * x)/sum(weights))
  }
  if(mode(x) == "complex") {
    if(trim > 0)
      stop("trimming not allowed for complex data")
    return(sum(x)/length(x))
  }
  x <- as.double(x)
  if(trim > 0) {
    if(trim >= 0.5)
      return(median(x, na.rm = F))
    if(!na.rm && length(which.na(x)))
      return(NA)
    n <- length(x)
    i1 <- floor(trim * n) + 1
    i2 <- n - i1 + 1
    x <- sort(x, unique(c(i1, i2)))[i1:i2]
  }
  sum(x)/length(x)
}

----
In your own vectorization work, you'll
* need to call functions that allow vector-valued arguments,
* want to call functions whose internal calculations are vectorized.
You should also write any functions of your own to allow vector-valued
arguments, where possible.

----
* Recursion -- avoid if possible, else use small argument lists.

# Bad (from V&R?):
combinations <- function(n, k, set = 1:n){
  # error checking omitted
  fun <- function(n, k, set){
    if(k <= 0) vector(mode(set), 0)
    else if(k >= n) set
    else rbind(cbind(set[1], Recall(n - 1, k - 1, set[-1])),
               Recall(n - 1, k, set[-1]))
  }
  fun(n, k, set)
}
# Note - all error checking is outside of `fun', but
# the third argument to fun may be a long vector.

# Better:
combinations3 <- function(n, k){
  # error checking code omitted
  fun <- function(n, k){
    if(k <= 0) integer(0)
    else if(k >= n) n:1
    else cbind(rbind((n), Recall(n - 1, k - 1)),
               Recall(n - 1, k))
  }
  # use (n) to avoid dimnames on the matrix
  fun(n, k)
}

# Best (avoid recursion altogether):
for(N in 2:n){
  Calculate combinations(N, k) for all values of k that will be needed
  in the next iteration of the loop, using the values of
  combinations(N-1, *) calculated in the previous iteration of the loop
}
# A version that does this is in S+Resample.
# Caveat: the difference is not just recursion; this also avoids
# repeating calculations.
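A smaller cousin of the same idea, as a sketch: the recursion
choose(N, k) = choose(N-1, k-1) + choose(N-1, k) can be replaced by a
single loop that updates a vector of counts (Pascal's triangle). Here
K, the largest k needed (K <= n), is assumed given:

ch <- c(1, rep(0, K))     # ch[k+1] holds choose(N, k) for k = 0:K
for(N in 1:n)
  ch <- c(1, ch[-1] + ch[-(K+1)])   # choose(N,k) = choose(N-1,k-1) + choose(N-1,k)
# now ch[k+1] == choose(n, k)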
-----------------------------------
Miscellaneous
    reuse calculations
    reuse objects
    avoid writing to frame 0 or 1 repeatedly
    save intermediate results only occasionally
    do as many calculations as possible on smaller objects
    use utilities for checking time and memory

----
* reuse calculations

# Bad:
if(length(which.na(x))){
  # x <- x[-which.na(x)]   # careful -- do this one last!
  y <- y[-which.na(x)]
  z <- z[-which.na(x)]
  x <- x[-which.na(x)]
}

# Good:
if(length(wna <- which.na(x))){
  x <- x[-wna]
  y <- y[-wna]
  z <- z[-wna]
}

This saves multiple recalculations. The flip side is that it uses more
memory, for saving the object. If this occurs inside a function, I don't
believe that it helps to do
  rm(wna)
inside the function in an attempt to reclaim the memory used by the object.

It is usually bad form to define an object inside an expression, because
it is easy for someone reading the function to overlook; the alternative
would be
  wna <- which.na(x)
  if(length(wna)){ ... }
However, a lot of S-PLUS code uses the shortcut when checking for missing
data, where objects are only used in the next few lines.

----
* reuse object

# Bad:
for(i in 1:nblocks){
  indices <- matrix(sample(1:n, n*B2, T), n)
  ...
}

# Good:
indices <- matrix(as.integer(0), n, B2)
for(i in 1:nblocks){
  indices[] <- sample(1:n, n*B2, T)
  ...
}

This reuses the current `indices' rather than creating a new copy, and
keeps the existing dimensions.

----------------
* avoid writing to frame 0 or 1 repeatedly
* save intermediate results only occasionally

I've run many huge Monte Carlo simulations, where it is desirable to save
intermediate results in case something crashes in a later replication
(there are surprisingly many functions that crash for certain combinations
of the input; e.g. lm() if the input is singular, which eventually happens
when bootstrapping a model with categorical variables).

# Bad:
f <- function(replications){
  # function for Monte Carlo simulations
  save <- array(NA, c(1000, 100, replications))
  for(i in 1:replications){
    save[,,i] <- someCalculations producing matrix[1000, 100]
    assign("save", save, frame=0)   # in case simulation crashes later
  }
}
Memory use grows dramatically: one copy of the big object "save" for
every iteration.

I do use for() loops here. It is often the case in my simulations that it
is impractical to vectorize the outermost loop. It is easier, and more
important, to vectorize inner loops than outer loops.

# Worse (!):
f <- function(replications){
  # function for Monte Carlo simulations
  save <- array(NA, c(1000, 100, replications))
  for(i in 1:replications){
    save[,,i] <- someCalculations producing matrix[1000, 100]
    remove("save", frame=0)
    assign("save", save, frame=0)   # in case simulation crashes later
  }
}
The memory does not get reclaimed until f exits; in fact remove() makes
memory growth worse! (This is the kind of thing you learn only by using
utilities to check time and memory use.)

# Better:
f <- function(replications){
  # function for Monte Carlo simulations
  save <- array(NA, c(1000, 100, replications))
  for(i in 1:replications){
    save[,,i] <- someCalculations producing matrix[1000, 100]
    if(!(i %% 100))
      assign("save", save, frame=0)  # save results every 100 replications
  }
}
# We have essentially this inside bootstrap().

----------------------------
* do as many calculations as possible on smaller objects

# If n and k are likely to be scalar and x a vector, then
# Bad:
x + n + k
# Good:
x + (n + k)

Caveat: there are some cases where you should explicitly replicate all
objects to the same length before doing other calculations. E.g. here, if
  x is length 12
  n is length 3
  k is length 4
then
  x + (n + k)
is equivalent to
  x + (n[c(1:3, 1, 1:3, 1, 1:3, 1)] + rep(k, 3))
which is probably not what is desired; and
  x + n + k
is equivalent to
  (x + n) + k
which is poor if the lengths are 3, 12, 4.

----
* use utilities for checking time and memory

You can't always predict which code will run faster, or which parts of
your code are eating up time or memory. The following functions are useful:

Time:
  sys.time()
  proc.time()

Memory:
  object.size()        reports bytes used by an object
  mem.tally.reset(), mem.tally.report()
  allocated()          reports number of bytes allocated for each frame
  memory.size()        reports bytes currently used by S;
                       if called with T, reports the maximum
  storage()            returns a list describing storage allocated
  storage.summary()    memory use (& waste) in a frame
  storageSummary()     compare to allocated(); also shows memory for
                       database objects

The mem.tally* pair is the most useful for programming purposes.

Memory use can affect time. In particular, if you use enough memory that
S+ needs to use virtual memory, then execution slows dramatically.
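For example, to compare two implementations of the column-means
calculation from earlier, or to measure the memory an expression uses
(a sketch using only functions named above):

unix.time(zmeans <- apply(z, 2, mean))
unix.time(zmeans <- colMeans(z))

mem.tally.reset()
z <- outer(x, y, "+")
mem.tally.report()   # memory allocated since the reset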
x <- 1:10^5
object.size(x)   # 400041

f <- function(a){
  temp <- mean(a)   # Lazy evaluation; make sure that a is evaluated
  print(allocated())
  print(storageSummary(frame=NULL))
  invisible(NULL)
}
f(x)
# The allocated() call doesn't show the large object.
# The storageSummary() call does; negative "frame numbers" refer
# to databases.

----------------------------
When efficiency is a big issue:
    avoid function calls
    pull out the guts of functions you are calling
    call existing C code
    avoid error-checking in interior functions
    avoid memory growth, use For() loops
    call new C code

----
* Avoid function calls

Every time you call a function inside another function, S+ has to set up
a frame for the new function, match arguments, and pass arguments (and
copy some of them). You can speed things up by avoiding function calls.
You'll have to balance this against the readability of the code.

# OK:
numRows(x)   # This calls nrow(), which calls dim()
# Faster:
dim(x)[1]
# Caveat - this bypasses methods for numRows()

# OK:
lchoose <- function(N, m, which)
  (lgamma(N[which]+1) - lgamma(m[which]+1) - lgamma((N-m)[which]+1))
p[nonzero] <- exp(lchoose(m, q, nonzero) +
                  lchoose(n, k-q, nonzero) -
                  lchoose(N, k, nonzero))
# Faster:
p[nonzero] <- exp((lgamma(m+1) - lgamma(q+1) - lgamma(m-q+1)) +
                  (lgamma(n+1) - lgamma(k-q+1) - lgamma(n-k+q+1)) -
                  (lgamma(N+1) - lgamma(k+1) - lgamma(N-k+1)))[nonzero]
# (compute for all elements, then subscript once at the end)

----
* pull out the guts of functions you are calling

perm.test1 <- function(x, y, n.sample = 1000, seed = 0){
  ny <- length(y)
  xy <- c(x, y)
  n <- length(xy)
  set.seed(seed)
  perm.mat <- sapply(1:n.sample, function(x, n) sample(n), n = n)
  perm.mat <- xy[perm.mat]
  dim(perm.mat) <- c(n, n.sample)
  sum.y.dist <- drop(rep(1, ny) %*% perm.mat[1:ny, ])
}

# Faster:
perm.test7 <- function(x, y, n.sample = 1000, seed = 0){
  ny <- length(y)
  xy <- c(x, y)
  n <- length(xy)
  set.seed(seed)
  f <- function(i, n, x = runif(n), partial = 1:n)
    .Internal(sort.list(x, partial), "S_sort_list", T, 0)[partial]
  f$n <- n
  f$partial <- 1:ny
  perm.mat <- unlist(lapply(1:n.sample, f))
  perm.mat <- xy[perm.mat]
  dim(perm.mat) <- c(ny, n.sample)
  sum.y.dist <- drop(rep(1, ny) %*% perm.mat)
}

    Nx    ny   n.sample   Old Time   New Time
   100     6       1000        2.0       0.29
   500     6       1000        3.1       0.54
  1000     6       1000        4.7       0.82

# Note - looked inside sample(), and used only the relevant parts
#   to define f.
# Used a trick to redefine the default values of function f.
# Used unlist(lapply(...)).

# Faster version (same until the end):
perm.test8 <- function(x, y, n.sample = 1000, seed = 0){
  ny <- length(y)
  xy <- c(x, y)
  n <- length(xy)
  set.seed(seed)
  f <- function(i, n, x = runif(n), partial = 1:n)
    .Internal(sort.list(x, partial), "S_sort_list", T, 0)[partial]
  f$n <- n
  f$partial <- 1:ny
  colSums(structure(xy[unlist(lapply(1:n.sample, f))],
                    dim = c(ny, n.sample)))
}
# Use colSums()
# Avoid defining the objects perm.mat and sum.y.dist
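# Aside - the f$n trick above works because in S-PLUS assigning to
# f$arg replaces the default value of argument `arg' of function f.
# A tiny sketch (g is hypothetical):
g <- function(i, n = 1) i + n
g$n <- 100
g(1)   # now returns 101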
----
* call existing C code

perm.test9 <- function(x, y, n.sample = 1000, seed = 0){
  ny <- length(y)
  xy <- c(x, y)
  n <- length(xy)
  set.seed(seed)
  f <- function(i, n, x = runif(n), partial = 1:n)
    .Internal(sort.list(x, partial), "S_sort_list", T, 0)[partial]
  f$n <- n
  f$partial <- 1:ny
  colSums(xy[unlist(lapply(1:n.sample, f))], n = ny)
}
# The "n" argument of colSums avoids the need to convert the object
# to a matrix first. Could save a bit of time using this function,
# which drops right down to C:

fastColMeans <- function(x, n = numRows(x), na.rm = F)
{
  # Treat x as if it has n rows. Use C code from colMeans.
  p <- length(x)/n
  .C("S_colSums_NA",
     as.integer(n),
     as.integer(p),
     as.double(x),
     answer = double(p),
     divide = TRUE,
     na.rm = as.integer(na.rm),
     NAOK = T,
     specialsok = T,
     COPY = c(F, F, F, T, F, F))$answer
}

--------------------------------
Case Study: V&R code for permutation sign test for paired comparisons.

# Data is d, assumed length 10; in practice it is the difference
# between paired observations.
d <- rnorm(10)
ttest <- function(x) mean(x)/sqrt(var(x)/length(x))
n <- 1000
res <- numeric(n)
for(i in 1:n) {
  x <- d*sign(runif(10)-0.5)
  res[i] <- ttest(x)
}

Comments:
* for() loops are slow; when possible avoid them.
* length(x) is constant; take it out of ttest, then adjust the results
  at the very end.
* var(x) = (mean(x^2) - mean(x)^2) * n/(n-1); in this example mean(x^2)
  is constant, and you've calculated mean(x) anyway.

unix.time({
  ttest <- function(x) mean(x)/sqrt(var(x)/length(x))
  n <- 1000
  res <- numeric(n)
  for(i in 1:n) {
    x <- d*sign(runif(10)-0.5)
    res[i] <- ttest(x)
  }})
# 111.01 4.34 169.0
# 132.75 4.46 198.0   (unix.time gives varying results)

# First revision; move some calculations outside the loop
unix.time({
  n <- 1000
  res <- numeric(n)
  for(i in 1:n) {
    x <- d*sign(runif(10)-0.5)
    res[i] <- mean(x)
  }
  res <- res/sqrt((mean(d^2) - res^2)/(length(d)-1))
})
# 6.78 0.05 10.0   (much faster)
# 9.31 0.00 13.0

# Second revision; use lapply() instead of for(),
# and use sum() instead of mean()
unix.time({
  res <- unlist(lapply(1:1000,
                       function(i, d, l) sum(d*sign(runif(l)-.5)),
                       d=d, l=length(d))) / length(d)
  res <- res/sqrt((mean(d^2) - res^2)/(length(d)-1))
})
# 2.21 0.00 3.0
# 10.99 0.01 15.0
# 7.80 0.02 10.0

# Third revision; truly vectorize
unix.time({
  signed.d <- sign(runif(10 * 1000) - .5) * d
  dim(signed.d) <- c(10, 1000)
  res <- colMeans(signed.d)
  res <- res/sqrt((mean(d^2) - res^2)/(length(d)-1))
})
# 0.20 0.05 1.0
# 0.22 0.02 0.0

# Fourth revision; truly vectorize, but don't use the fast trick
# for the variance
unix.time({
  signed.d <- sign(runif(10 * 1000) - .5) * d
  dim(signed.d) <- c(10, 1000)
  res <- colMeans(signed.d)/sqrt(colVars(signed.d) / length(d))
})
# 0.30 0.03 0.0
# 0.24 0.03 0.0

# Fifth version; go all out
unix.time({
  res <- fastColMeans(sign(runif(10 * 1000) - .5) * d, 10)
  res <- res/sqrt((mean(d^2) - res^2)/(length(d)-1))
})
# 0.15 0.01 0.0
# 0.13 0.03 0.0
# 0.16 0.00 0.0

# All versions give the same results (do set.seed(0) first).
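That last comment can be made concrete; a sketch of such a check,
comparing the fourth and fifth versions (fastColMeans is defined above):

set.seed(0)
signed.d <- matrix(sign(runif(10 * 1000) - .5) * d, 10)
res4 <- colMeans(signed.d)/sqrt(colVars(signed.d)/length(d))
set.seed(0)
res5 <- fastColMeans(sign(runif(10 * 1000) - .5) * d, 10)
res5 <- res5/sqrt((mean(d^2) - res5^2)/(length(d)-1))
all.equal(res4, res5)   # should be TRUE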