NA removal

l5 <- c(1,2,NA,4,NA,6)
l5a <- l5[!is.na(l5)]
l5a
## [1] 1 2 4 6
l5b <- c("a","b",NA,"d",NA,"f") 
good.pairs <- complete.cases(l5,l5b) # if we need to join both lists only with complete pairs...
data.frame(a=l5[good.pairs],b=l5b[good.pairs]) # another example: airquality is a data frame
##   a b
## 1 1 a
## 2 2 b
## 3 4 d
## 4 6 f
airquality[4,][c(1,4,3)] # shows cols 1, 4 & 3 from the 4th row
##   Ozone Temp Wind
## 4    18   62 11.5
airquality[complete.cases(airquality),][1:6,] # to show all non NA rows for the first 6 columns
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 7    23     299  8.6   65     5   7
## 8    19      99 13.8   59     5   8

Sample function

Function sample takes a sample of the specified size from the elements of x using either with or without replacement.

coin <- c("Heads","Tails")
sample(coin,8,rep=T)
## [1] "Tails" "Tails" "Tails" "Heads" "Tails" "Tails" "Heads" "Heads"
dice.events <- 1:6  # dice possible results
ten.throws <- sample(dice.events, 10, replace=T)  # balanced dice
ten.throws
##  [1] 5 6 1 5 3 1 6 4 5 6
ten.more.throws <- sample(dice.events, 10, replace=T, prob=c(.1,.1,.1,.1,.1,.5))  # unbalanced dice (50% it falls 6)
ten.more.throws
##  [1] 6 6 6 6 6 6 5 3 6 5
# Without replacement, we can create permutations
xs <- 1:10
sample(xs,10)
##  [1]  2  6  5 10  8  7  3  1  4  9
sample(xs,10)
##  [1]  5  9  3  8  6  7  2  4 10  1
# Eg: we have 50 pacients that we want to separate into two equal size groups (Control or Treatment)
pacients <- data.frame(patient = 1:50,
                       age = rnorm(100, mean = 60, sd = 12))
head(pacients)
##   patient   age
## 1       1 62.34
## 2       2 76.81
## 3       3 67.50
## 4       4 61.09
## 5       5 69.93
## 6       6 69.68
xs <- rep(c("Control", "Treat"),25)
xs
##  [1] "Control" "Treat"   "Control" "Treat"   "Control" "Treat"   "Control"
##  [8] "Treat"   "Control" "Treat"   "Control" "Treat"   "Control" "Treat"  
## [15] "Control" "Treat"   "Control" "Treat"   "Control" "Treat"   "Control"
## [22] "Treat"   "Control" "Treat"   "Control" "Treat"   "Control" "Treat"  
## [29] "Control" "Treat"   "Control" "Treat"   "Control" "Treat"   "Control"
## [36] "Treat"   "Control" "Treat"   "Control" "Treat"   "Control" "Treat"  
## [43] "Control" "Treat"   "Control" "Treat"   "Control" "Treat"   "Control"
## [50] "Treat"
xs <- sample(xs,50)  # a random permutation to remove selection bias
xs
##  [1] "Control" "Control" "Control" "Treat"   "Control" "Control" "Control"
##  [8] "Control" "Treat"   "Treat"   "Treat"   "Control" "Control" "Treat"  
## [15] "Treat"   "Control" "Control" "Control" "Treat"   "Treat"   "Treat"  
## [22] "Control" "Treat"   "Treat"   "Treat"   "Treat"   "Treat"   "Treat"  
## [29] "Treat"   "Treat"   "Treat"   "Control" "Treat"   "Control" "Control"
## [36] "Control" "Treat"   "Control" "Control" "Treat"   "Control" "Control"
## [43] "Control" "Treat"   "Treat"   "Treat"   "Control" "Treat"   "Control"
## [50] "Control"
pacients$group <- as.factor(xs)  # add new column with required information
head(pacients)
##   patient   age   group
## 1       1 62.34 Control
## 2       2 76.81 Control
## 3       3 67.50 Control
## 4       4 61.09   Treat
## 5       5 69.93 Control
## 6       6 69.68 Control
# Eg: simple bootstrap example (from: www.sigmafield.org/2010/05/23/r-function-of-the-day-sample)
rn <- rnorm(1000, 10) # random sample from a normal distribution
# making 1000 samples and keeping the means
sample.means <- replicate(1000, mean(sample(rn, replace = TRUE)))
sample.means[1:20]
##  [1]  9.991 10.032 10.040 10.000 10.064 10.027  9.981 10.067  9.978  9.989
## [11] 10.043 10.057 10.007 10.005  9.977 10.014 10.020  9.949 10.031 10.080
quantile(sample.means, probs = c(0.025, 0.975)) # produces sample quantiles corresponding to the given probabilities
##   2.5%  97.5% 
##  9.947 10.065
# Compare this to the standard parametric technique.
t.test(rn)$conf.int
## [1]  9.943 10.068
## attr(,"conf.level")
## [1] 0.95

split-apply-combine

References: + http://www.sigmafield.org/2009/09/20/r-function-of-the-day-tapply + http://stackoverflow.com/questions/11562656/averaging-column-values-for-specific-sections-of-data-corresponding-to-other-col/11562850#11562850

check also: http://www.jstatsoft.org/v40/i01/paper

We use tapply when: * A dataset that can be broken up into groups * We want to break it up into groups * Within each group, we want to apply a function

The tapply function is useful when we need to break up a vector into groups defined by some classifying factor, compute a function on the subsets, and return the results in a convenient form.

## generate data for medical example
medical.example <- data.frame(patient = 1:100,
        age = rnorm(100, mean = 60, sd = 12),
        treatment = gl(2, 50, labels = c("Treatment", "Control")))
# we want to break the dataset by Treatment, and then find the age's mean 
tapply(medical.example$age, medical.example$treatment, mean)
## Treatment   Control 
##     60.12     60.17
# Another eg
baseball.example <- data.frame(team = gl(5, 5,
    labels = paste("Team", LETTERS[1:5])),
    player = sample(letters, 25), batting.average = runif(25, .200, .400))
# we want to break the dataset by Team, and then find the max batting avg
tapply(baseball.example$batting.average, baseball.example$team, max)
## Team A Team B Team C Team D Team E 
## 0.3852 0.3832 0.3165 0.3686 0.3186
# an artificial eg from R's help:
n <- 17
fac <- factor(rep(1:3, length = n), levels = 1:5)
fac
##  [1] 1 2 3 1 2 3 1 2 3 1 2 3 1 2 3 1 2
## Levels: 1 2 3 4 5
table(fac)
## fac
## 1 2 3 4 5 
## 6 6 5 0 0
# this next function, separates all 1:n value into the factors given by fac, 
# and then sum each factor
tapply(1:n, fac, sum)
##  1  2  3  4  5 
## 51 57 45 NA NA
# Let's explicitly sum the first factor just to confirm:
fac == 1
##  [1]  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE
## [12] FALSE  TRUE FALSE FALSE  TRUE FALSE
(1:n)[fac==1]
## [1]  1  4  7 10 13 16
sum((1:n)[fac==1])
## [1] 51
# we can apply any unary function over each factor
tapply(1:n, fac, range)
## $`1`
## [1]  1 16
## 
## $`2`
## [1]  2 17
## 
## $`3`
## [1]  3 15
## 
## $`4`
## NULL
## 
## $`5`
## NULL
tapply(1:n, fac, quantile)
## $`1`
##    0%   25%   50%   75%  100% 
##  1.00  4.75  8.50 12.25 16.00 
## 
## $`2`
##    0%   25%   50%   75%  100% 
##  2.00  5.75  9.50 13.25 17.00 
## 
## $`3`
##   0%  25%  50%  75% 100% 
##    3    6    9   12   15 
## 
## $`4`
## NULL
## 
## $`5`
## NULL

Other possibilities:

df <- data.frame(dive=factor(sample(c("dive1","dive2"),10,replace=TRUE)),
                 speed=runif(10))
df
##     dive    speed
## 1  dive1 0.398424
## 2  dive1 0.725862
## 3  dive1 0.308170
## 4  dive2 0.090411
## 5  dive1 0.886866
## 6  dive2 0.205537
## 7  dive2 0.603882
## 8  dive1 0.323705
## 9  dive2 0.288841
## 10 dive2 0.004747
# 'by()' takes in vectors and applies a function to them
res.by <- by( df$speed, df$dive, mean)
library(taRifx)
## Warning: package 'taRifx' was built under R version 3.1.1
as.data.frame(res.by)
##    IDX1  value
## 1 dive1 0.5286
## 2 dive2 0.2387
# takes in data.frames, outputs data.frames, and uses a formula interface.
aggregate( speed ~ dive, df, mean )
##    dive  speed
## 1 dive1 0.5286
## 2 dive2 0.2387
# Check also library plyr which facilitates this type of data frama manipulation

Rle function

Function rle computes the lengths and values of runs of equal values in a vector.

Function inverse.rle does the reverse operation.

coin <- c("H","T")
experiment <- sample(coin,100,rep=T)
experiment
##   [1] "T" "T" "T" "H" "T" "H" "H" "H" "H" "H" "T" "H" "H" "T" "H" "H" "T"
##  [18] "T" "T" "T" "T" "H" "T" "T" "T" "T" "T" "H" "H" "H" "H" "H" "T" "T"
##  [35] "T" "T" "H" "T" "T" "T" "T" "H" "H" "H" "H" "T" "H" "H" "T" "H" "T"
##  [52] "T" "H" "T" "H" "H" "T" "H" "H" "H" "H" "H" "T" "T" "T" "T" "H" "T"
##  [69] "T" "T" "H" "H" "H" "T" "T" "H" "T" "T" "T" "T" "T" "T" "H" "T" "H"
##  [86] "H" "H" "T" "T" "H" "H" "H" "H" "H" "H" "T" "H" "H" "H" "H"
experiment.rle <- rle(experiment)
max.seq <- max(experiment.rle$lengths) # the maximum sequence of equal values
max.seq
## [1] 6
experiment.rle$values[which(experiment.rle$lengths == max.seq)] # was it heads or tails?
## [1] "T" "H"
# we can compute both max sequences of heads and tails by using tapply:
tapply(experiment.rle$lengths, experiment.rle$values, max)
## H T 
## 6 6
inverse.rle(experiment.rle) #  recomputes the initial sequence
##   [1] "T" "T" "T" "H" "T" "H" "H" "H" "H" "H" "T" "H" "H" "T" "H" "H" "T"
##  [18] "T" "T" "T" "T" "H" "T" "T" "T" "T" "T" "H" "H" "H" "H" "H" "T" "T"
##  [35] "T" "T" "H" "T" "T" "T" "T" "H" "H" "H" "H" "T" "H" "H" "T" "H" "T"
##  [52] "T" "H" "T" "H" "H" "T" "H" "H" "H" "H" "H" "T" "T" "T" "T" "H" "T"
##  [69] "T" "T" "H" "H" "H" "T" "T" "H" "T" "T" "T" "T" "T" "T" "H" "T" "H"
##  [86] "H" "H" "T" "T" "H" "H" "H" "H" "H" "H" "T" "H" "H" "H" "H"

Split/Cut functions

Function split divides the data in the vector x into the groups defined by f. The replacement forms replace values corresponding to such a division.

Function cut divides the range of x into intervals and codes the values in x according to which interval they fall. The leftmost interval corresponds to level one, the next leftmost to level two and so on.

split(1:10, 1:2)
## $`1`
## [1] 1 3 5 7 9
## 
## $`2`
## [1]  2  4  6  8 10
split(1:10, 1:2)[[1]]  # or: split(1:10, 1:2)$"1"
## [1] 1 3 5 7 9
ma <- cbind(x = 1:10, y = (-4:5)^2)
ma
##        x  y
##  [1,]  1 16
##  [2,]  2  9
##  [3,]  3  4
##  [4,]  4  1
##  [5,]  5  0
##  [6,]  6  1
##  [7,]  7  4
##  [8,]  8  9
##  [9,]  9 16
## [10,] 10 25
split(ma, colnames(ma))
## $x
##  [1]  1  3  5  7  9 16  4  0  4 16
## 
## $y
##  [1]  2  4  6  8 10  9  1  1  9 25
split(ma, col(ma))
## $`1`
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $`2`
##  [1] 16  9  4  1  0  1  4  9 16 25
split(ma, col(ma))[[1]] 
##  [1]  1  2  3  4  5  6  7  8  9 10
ma[,1]                   # easier alternative
##  [1]  1  2  3  4  5  6  7  8  9 10
ma[,"x"]                 # another alternative
##  [1]  1  2  3  4  5  6  7  8  9 10
xs <- 1:20
split(xs,factor(c("yes","no","yes","yes"))) # every four elements are attached to the given level sequence "yes","no","yes","yes"
## $no
## [1]  2  6 10 14 18
## 
## $yes
##  [1]  1  3  4  5  7  8  9 11 12 13 15 16 17 19 20
is <- split(xs,factor(c("yes","no","yes","yes")))[["yes"]]
xs[is]
##  [1]  1  3  4  5  7  8  9 11 12 13 15 16 17 19 20
# Use of cut (from: www.sigmafield.org/2009/09/23/r-function-of-the-day-cut)
# generate some data 
clinical.trial <-
    data.frame(patient = 1:100, age = rnorm(100, mean = 60, sd = 8),
               year = sample(paste("19", 85:99, sep = ""), 100, replace = T))
# Now, we will use the cut function to make age a factor, ie, 
# age will be a categorical variable
c1 <- cut(clinical.trial$age, breaks = 4)
table(c1)
## c1
## (39.8,50.4]   (50.4,61]   (61,71.5] (71.5,82.1] 
##           9          43          44           4
# year.enroll is a factor, so must convert to numeric first!
years <- as.numeric(as.character(clinical.trial$year))
years
##   [1] 1999 1995 1991 1995 1996 1998 1989 1988 1987 1999 1991 1998 1994 1992
##  [15] 1990 1989 1987 1996 1995 1987 1990 1990 1997 1994 1999 1999 1995 1989
##  [29] 1996 1987 1992 1992 1997 1987 1988 1997 1985 1985 1989 1988 1985 1990
##  [43] 1986 1999 1997 1997 1995 1986 1988 1996 1989 1987 1999 1991 1999 1994
##  [57] 1990 1993 1995 1990 1988 1995 1995 1985 1989 1987 1989 1992 1988 1998
##  [71] 1985 1998 1989 1986 1987 1993 1990 1999 1998 1998 1993 1998 1986 1997
##  [85] 1991 1989 1989 1990 1994 1998 1990 1994 1994 1999 1987 1999 1985 1985
##  [99] 1997 1986
c2 <- cut(years, breaks = 3)
table(c2)
## c2
## (1985,1990] (1990,1994] (1994,1999] 
##          37          26          37
# The previous intervals where determined by R. 
# We can force our own (in this case, I'll use 'seq')
c3 <- cut(clinical.trial$age, breaks = seq(30, 80, by = 10))
table(c3)
## c3
## (30,40] (40,50] (50,60] (60,70] (70,80] 
##       1       7      41      45       5
# the interval size can also be variable:
age.cat <- function(x, lower = 0, upper, by = 10,
                   sep = "-", above.char = "+") {

 labs <- c(paste(seq(lower, upper - by, by = by),
                 seq(lower + by - 1, upper - 1, by = by),
                 sep = sep),
           paste(upper, above.char, sep = ""))

 cut(floor(x), breaks = c(seq(lower, upper, by = by), Inf),
     right = FALSE, labels = labs)
}
# This function categorizes age in a fairly flexible way. The first assignment to labs inside the function creates a vector of labels. Then, the cut function is called to do the work, with the custom labels as an argument.

# only specifying an upper bound, uses 0 as lower bound, and breaks up categories by 10
table(age.cat(clinical.trial$age, upper = 70))
## 
##   0-9 10-19 20-29 30-39 40-49 50-59 60-69   70+ 
##     0     0     0     1     7    41    45     6
# now specifying a lower bound
table(age.cat(clinical.trial$age, lower = 30, upper = 70))
## 
## 30-39 40-49 50-59 60-69   70+ 
##     1     7    41    45     6
# now specifying a lower bound AND the "by" argument 
table(age.cat(clinical.trial$age, lower = 30, upper = 70, by = 5))
## 
## 30-34 35-39 40-44 45-49 50-54 55-59 60-64 65-69   70+ 
##     0     1     1     6    17    24    25    20     6

Subsetting

subset returns subsets of vectors, matrices or data frames which meet conditions.

head(airquality)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6
head(subset(airquality, Temp > 80, select = c(Ozone, Temp)))
##    Ozone Temp
## 29    45   81
## 35    NA   84
## 36    NA   85
## 38    29   82
## 39    NA   87
## 40    71   90
head(subset(airquality, Day == 1, select = -Temp))
##     Ozone Solar.R Wind Month Day
## 1      41     190  7.4     5   1
## 32     NA     286  8.6     6   1
## 62    135     269  4.1     7   1
## 93     39      83  6.9     8   1
## 124    96     167  6.9     9   1
head(subset(airquality, select = Ozone:Wind))
##   Ozone Solar.R Wind
## 1    41     190  7.4
## 2    36     118  8.0
## 3    12     149 12.6
## 4    18     313 11.5
## 5    NA      NA 14.3
## 6    28      NA 14.9