## NA removal

l5 <- c(1,2,NA,4,NA,6)
l5a <- l5[!is.na(l5)]
l5a
## [1] 1 2 4 6
l5b <- c("a","b",NA,"d",NA,"f")
good.pairs <- complete.cases(l5,l5b) # if we need to join both lists only with complete pairs...
data.frame(a=l5[good.pairs],b=l5b[good.pairs]) # another example: airquality is a data frame
##   a b
## 1 1 a
## 2 2 b
## 3 4 d
## 4 6 f
airquality[4,][c(1,4,3)] # shows cols 1, 4 & 3 from the 4th row
##   Ozone Temp Wind
## 4    18   62 11.5
airquality[complete.cases(airquality),][1:6,] # to show all non NA rows for the first 6 columns
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 7    23     299  8.6   65     5   7
## 8    19      99 13.8   59     5   8

## Sample function

Function sample takes a sample of the specified size from the elements of x using either with or without replacement.

coin <- c("Heads","Tails")
sample(coin,8,rep=T)
## [1] "Tails" "Tails" "Tails" "Heads" "Tails" "Tails" "Heads" "Heads"
dice.events <- 1:6  # dice possible results
ten.throws <- sample(dice.events, 10, replace=T)  # balanced dice
ten.throws
##  [1] 5 6 1 5 3 1 6 4 5 6
ten.more.throws <- sample(dice.events, 10, replace=T, prob=c(.1,.1,.1,.1,.1,.5))  # unbalanced dice (50% it falls 6)
ten.more.throws
##  [1] 6 6 6 6 6 6 5 3 6 5
# Without replacement, we can create permutations
xs <- 1:10
sample(xs,10)
##  [1]  2  6  5 10  8  7  3  1  4  9
sample(xs,10)
##  [1]  5  9  3  8  6  7  2  4 10  1
# Eg: we have 50 pacients that we want to separate into two equal size groups (Control or Treatment)
pacients <- data.frame(patient = 1:50,
age = rnorm(100, mean = 60, sd = 12))
head(pacients)
##   patient   age
## 1       1 62.34
## 2       2 76.81
## 3       3 67.50
## 4       4 61.09
## 5       5 69.93
## 6       6 69.68
xs <- rep(c("Control", "Treat"),25)
xs
##  [1] "Control" "Treat"   "Control" "Treat"   "Control" "Treat"   "Control"
##  [8] "Treat"   "Control" "Treat"   "Control" "Treat"   "Control" "Treat"
## [15] "Control" "Treat"   "Control" "Treat"   "Control" "Treat"   "Control"
## [22] "Treat"   "Control" "Treat"   "Control" "Treat"   "Control" "Treat"
## [29] "Control" "Treat"   "Control" "Treat"   "Control" "Treat"   "Control"
## [36] "Treat"   "Control" "Treat"   "Control" "Treat"   "Control" "Treat"
## [43] "Control" "Treat"   "Control" "Treat"   "Control" "Treat"   "Control"
## [50] "Treat"
xs <- sample(xs,50)  # a random permutation to remove selection bias
xs
##  [1] "Control" "Control" "Control" "Treat"   "Control" "Control" "Control"
##  [8] "Control" "Treat"   "Treat"   "Treat"   "Control" "Control" "Treat"
## [15] "Treat"   "Control" "Control" "Control" "Treat"   "Treat"   "Treat"
## [22] "Control" "Treat"   "Treat"   "Treat"   "Treat"   "Treat"   "Treat"
## [29] "Treat"   "Treat"   "Treat"   "Control" "Treat"   "Control" "Control"
## [36] "Control" "Treat"   "Control" "Control" "Treat"   "Control" "Control"
## [43] "Control" "Treat"   "Treat"   "Treat"   "Control" "Treat"   "Control"
## [50] "Control"
pacients$group <- as.factor(xs) # add new column with required information head(pacients) ## patient age group ## 1 1 62.34 Control ## 2 2 76.81 Control ## 3 3 67.50 Control ## 4 4 61.09 Treat ## 5 5 69.93 Control ## 6 6 69.68 Control # Eg: simple bootstrap example (from: www.sigmafield.org/2010/05/23/r-function-of-the-day-sample) rn <- rnorm(1000, 10) # random sample from a normal distribution # making 1000 samples and keeping the means sample.means <- replicate(1000, mean(sample(rn, replace = TRUE))) sample.means[1:20] ## [1] 9.991 10.032 10.040 10.000 10.064 10.027 9.981 10.067 9.978 9.989 ## [11] 10.043 10.057 10.007 10.005 9.977 10.014 10.020 9.949 10.031 10.080 quantile(sample.means, probs = c(0.025, 0.975)) # produces sample quantiles corresponding to the given probabilities ## 2.5% 97.5% ## 9.947 10.065 # Compare this to the standard parametric technique. t.test(rn)$conf.int
## [1]  9.943 10.068
## attr(,"conf.level")
## [1] 0.95

## split-apply-combine

check also: http://www.jstatsoft.org/v40/i01/paper

We use tapply when: * A dataset that can be broken up into groups * We want to break it up into groups * Within each group, we want to apply a function

The tapply function is useful when we need to break up a vector into groups defined by some classifying factor, compute a function on the subsets, and return the results in a convenient form.

## generate data for medical example
medical.example <- data.frame(patient = 1:100,
age = rnorm(100, mean = 60, sd = 12),
treatment = gl(2, 50, labels = c("Treatment", "Control")))
# we want to break the dataset by Treatment, and then find the age's mean
tapply(medical.example$age, medical.example$treatment, mean)
## Treatment   Control
##     60.12     60.17
# Another eg
baseball.example <- data.frame(team = gl(5, 5,
labels = paste("Team", LETTERS[1:5])),
player = sample(letters, 25), batting.average = runif(25, .200, .400))
# we want to break the dataset by Team, and then find the max batting avg
tapply(baseball.example$batting.average, baseball.example$team, max)
## Team A Team B Team C Team D Team E
## 0.3852 0.3832 0.3165 0.3686 0.3186
# an artificial eg from R's help:
n <- 17
fac <- factor(rep(1:3, length = n), levels = 1:5)
fac
##  [1] 1 2 3 1 2 3 1 2 3 1 2 3 1 2 3 1 2
## Levels: 1 2 3 4 5
table(fac)
## fac
## 1 2 3 4 5
## 6 6 5 0 0
# this next function, separates all 1:n value into the factors given by fac,
# and then sum each factor
tapply(1:n, fac, sum)
##  1  2  3  4  5
## 51 57 45 NA NA
# Let's explicitly sum the first factor just to confirm:
fac == 1
##  [1]  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE
## [12] FALSE  TRUE FALSE FALSE  TRUE FALSE
(1:n)[fac==1]
## [1]  1  4  7 10 13 16
sum((1:n)[fac==1])
## [1] 51
# we can apply any unary function over each factor
tapply(1:n, fac, range)
## $1 ## [1] 1 16 ## ##$2
## [1]  2 17
##
## $3 ## [1] 3 15 ## ##$4
## NULL
##
## $5 ## NULL tapply(1:n, fac, quantile) ##$1
##    0%   25%   50%   75%  100%
##  1.00  4.75  8.50 12.25 16.00
##
## $2 ## 0% 25% 50% 75% 100% ## 2.00 5.75 9.50 13.25 17.00 ## ##$3
##   0%  25%  50%  75% 100%
##    3    6    9   12   15
##
## $4 ## NULL ## ##$5
## NULL

Other possibilities:

df <- data.frame(dive=factor(sample(c("dive1","dive2"),10,replace=TRUE)),
speed=runif(10))
df
##     dive    speed
## 1  dive1 0.398424
## 2  dive1 0.725862
## 3  dive1 0.308170
## 4  dive2 0.090411
## 5  dive1 0.886866
## 6  dive2 0.205537
## 7  dive2 0.603882
## 8  dive1 0.323705
## 9  dive2 0.288841
## 10 dive2 0.004747
# 'by()' takes in vectors and applies a function to them
res.by <- by( df$speed, df$dive, mean)
library(taRifx)
## Warning: package 'taRifx' was built under R version 3.1.1
as.data.frame(res.by)
##    IDX1  value
## 1 dive1 0.5286
## 2 dive2 0.2387
# takes in data.frames, outputs data.frames, and uses a formula interface.
aggregate( speed ~ dive, df, mean )
##    dive  speed
## 1 dive1 0.5286
## 2 dive2 0.2387
# Check also library plyr which facilitates this type of data frama manipulation

## Rle function

Function rle computes the lengths and values of runs of equal values in a vector.

Function inverse.rle does the reverse operation.

coin <- c("H","T")
experiment <- sample(coin,100,rep=T)
experiment
##   [1] "T" "T" "T" "H" "T" "H" "H" "H" "H" "H" "T" "H" "H" "T" "H" "H" "T"
##  [18] "T" "T" "T" "T" "H" "T" "T" "T" "T" "T" "H" "H" "H" "H" "H" "T" "T"
##  [35] "T" "T" "H" "T" "T" "T" "T" "H" "H" "H" "H" "T" "H" "H" "T" "H" "T"
##  [52] "T" "H" "T" "H" "H" "T" "H" "H" "H" "H" "H" "T" "T" "T" "T" "H" "T"
##  [69] "T" "T" "H" "H" "H" "T" "T" "H" "T" "T" "T" "T" "T" "T" "H" "T" "H"
##  [86] "H" "H" "T" "T" "H" "H" "H" "H" "H" "H" "T" "H" "H" "H" "H"
experiment.rle <- rle(experiment)
max.seq <- max(experiment.rle$lengths) # the maximum sequence of equal values max.seq ## [1] 6 experiment.rle$values[which(experiment.rle$lengths == max.seq)] # was it heads or tails? ## [1] "T" "H" # we can compute both max sequences of heads and tails by using tapply: tapply(experiment.rle$lengths, experiment.rle$values, max) ## H T ## 6 6 inverse.rle(experiment.rle) # recomputes the initial sequence ## [1] "T" "T" "T" "H" "T" "H" "H" "H" "H" "H" "T" "H" "H" "T" "H" "H" "T" ## [18] "T" "T" "T" "T" "H" "T" "T" "T" "T" "T" "H" "H" "H" "H" "H" "T" "T" ## [35] "T" "T" "H" "T" "T" "T" "T" "H" "H" "H" "H" "T" "H" "H" "T" "H" "T" ## [52] "T" "H" "T" "H" "H" "T" "H" "H" "H" "H" "H" "T" "T" "T" "T" "H" "T" ## [69] "T" "T" "H" "H" "H" "T" "T" "H" "T" "T" "T" "T" "T" "T" "H" "T" "H" ## [86] "H" "H" "T" "T" "H" "H" "H" "H" "H" "H" "T" "H" "H" "H" "H" ## Split/Cut functions Function split divides the data in the vector x into the groups defined by f. The replacement forms replace values corresponding to such a division. Function cut divides the range of x into intervals and codes the values in x according to which interval they fall. The leftmost interval corresponds to level one, the next leftmost to level two and so on. split(1:10, 1:2) ##$1
## [1] 1 3 5 7 9
##
## $2 ## [1] 2 4 6 8 10 split(1:10, 1:2)[[1]] # or: split(1:10, 1:2)$"1"
## [1] 1 3 5 7 9
ma <- cbind(x = 1:10, y = (-4:5)^2)
ma
##        x  y
##  [1,]  1 16
##  [2,]  2  9
##  [3,]  3  4
##  [4,]  4  1
##  [5,]  5  0
##  [6,]  6  1
##  [7,]  7  4
##  [8,]  8  9
##  [9,]  9 16
## [10,] 10 25
split(ma, colnames(ma))
## $x ## [1] 1 3 5 7 9 16 4 0 4 16 ## ##$y
##  [1]  2  4  6  8 10  9  1  1  9 25
split(ma, col(ma))
## $1 ## [1] 1 2 3 4 5 6 7 8 9 10 ## ##$2
##  [1] 16  9  4  1  0  1  4  9 16 25
split(ma, col(ma))[[1]] 
##  [1]  1  2  3  4  5  6  7  8  9 10
ma[,1]                   # easier alternative
##  [1]  1  2  3  4  5  6  7  8  9 10
ma[,"x"]                 # another alternative
##  [1]  1  2  3  4  5  6  7  8  9 10
xs <- 1:20
split(xs,factor(c("yes","no","yes","yes"))) # every four elements are attached to the given level sequence "yes","no","yes","yes"
## $no ## [1] 2 6 10 14 18 ## ##$yes
##  [1]  1  3  4  5  7  8  9 11 12 13 15 16 17 19 20
is <- split(xs,factor(c("yes","no","yes","yes")))[["yes"]]
xs[is]
##  [1]  1  3  4  5  7  8  9 11 12 13 15 16 17 19 20
# Use of cut (from: www.sigmafield.org/2009/09/23/r-function-of-the-day-cut)
# generate some data
clinical.trial <-
data.frame(patient = 1:100, age = rnorm(100, mean = 60, sd = 8),
year = sample(paste("19", 85:99, sep = ""), 100, replace = T))
# Now, we will use the cut function to make age a factor, ie,
# age will be a categorical variable
c1 <- cut(clinical.trial$age, breaks = 4) table(c1) ## c1 ## (39.8,50.4] (50.4,61] (61,71.5] (71.5,82.1] ## 9 43 44 4 # year.enroll is a factor, so must convert to numeric first! years <- as.numeric(as.character(clinical.trial$year))
years
##   [1] 1999 1995 1991 1995 1996 1998 1989 1988 1987 1999 1991 1998 1994 1992
##  [15] 1990 1989 1987 1996 1995 1987 1990 1990 1997 1994 1999 1999 1995 1989
##  [29] 1996 1987 1992 1992 1997 1987 1988 1997 1985 1985 1989 1988 1985 1990
##  [43] 1986 1999 1997 1997 1995 1986 1988 1996 1989 1987 1999 1991 1999 1994
##  [57] 1990 1993 1995 1990 1988 1995 1995 1985 1989 1987 1989 1992 1988 1998
##  [71] 1985 1998 1989 1986 1987 1993 1990 1999 1998 1998 1993 1998 1986 1997
##  [85] 1991 1989 1989 1990 1994 1998 1990 1994 1994 1999 1987 1999 1985 1985
##  [99] 1997 1986
c2 <- cut(years, breaks = 3)
table(c2)
## c2
## (1985,1990] (1990,1994] (1994,1999]
##          37          26          37
# The previous intervals where determined by R.
# We can force our own (in this case, I'll use 'seq')
c3 <- cut(clinical.trial$age, breaks = seq(30, 80, by = 10)) table(c3) ## c3 ## (30,40] (40,50] (50,60] (60,70] (70,80] ## 1 7 41 45 5 # the interval size can also be variable: age.cat <- function(x, lower = 0, upper, by = 10, sep = "-", above.char = "+") { labs <- c(paste(seq(lower, upper - by, by = by), seq(lower + by - 1, upper - 1, by = by), sep = sep), paste(upper, above.char, sep = "")) cut(floor(x), breaks = c(seq(lower, upper, by = by), Inf), right = FALSE, labels = labs) } # This function categorizes age in a fairly flexible way. The first assignment to labs inside the function creates a vector of labels. Then, the cut function is called to do the work, with the custom labels as an argument. # only specifying an upper bound, uses 0 as lower bound, and breaks up categories by 10 table(age.cat(clinical.trial$age, upper = 70))
##
##   0-9 10-19 20-29 30-39 40-49 50-59 60-69   70+
##     0     0     0     1     7    41    45     6
# now specifying a lower bound
table(age.cat(clinical.trial$age, lower = 30, upper = 70)) ## ## 30-39 40-49 50-59 60-69 70+ ## 1 7 41 45 6 # now specifying a lower bound AND the "by" argument table(age.cat(clinical.trial$age, lower = 30, upper = 70, by = 5))
##
## 30-34 35-39 40-44 45-49 50-54 55-59 60-64 65-69   70+
##     0     1     1     6    17    24    25    20     6

## Subsetting

subset returns subsets of vectors, matrices or data frames which meet conditions.

head(airquality)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6
head(subset(airquality, Temp > 80, select = c(Ozone, Temp)))
##    Ozone Temp
## 29    45   81
## 35    NA   84
## 36    NA   85
## 38    29   82
## 39    NA   87
## 40    71   90
head(subset(airquality, Day == 1, select = -Temp))
##     Ozone Solar.R Wind Month Day
## 1      41     190  7.4     5   1
## 32     NA     286  8.6     6   1
## 62    135     269  4.1     7   1
## 93     39      83  6.9     8   1
## 124    96     167  6.9     9   1
head(subset(airquality, select = Ozone:Wind))
##   Ozone Solar.R Wind
## 1    41     190  7.4
## 2    36     118  8.0
## 3    12     149 12.6
## 4    18     313 11.5
## 5    NA      NA 14.3
## 6    28      NA 14.9