Graphical Tools

Scatter plots
Histogram, Boxplot and Barplot
Labels with Math
3D perspectives
Confidence Intervals

# par is used to specify global graphics parameters
# adn affects of subsequent graph functions

par(pch=1)       # defines the plotting symbol
par(lty=2)       # defines the line type
par(lwd=1)       # line's width
par(col="black") # plotting color
par(bg="white")  # background color

par(mfrow=c(2,2))  # number of plots/(row,column) (plots filled row-wise)
par(mfcol=c(2,2))  # number of plots/(row,column) (plots filled col-wise)
par(mfcol=c(1,1))  # default

# ?par check more options

par("mar")  # check the current value of a given attribute, in this case the margins' size

Scatter plots

xs <- seq(-2,2,.05)
plot(xs,dnorm(xs),xlab="",ylab="", pch=20)

lines(c(-2,2),c(.05,.4), col="green", lwd=2) #  add a line to the current plot
points(seq(-2,2,length.out=10),rep(.2,10),pch=20, col="blue")     # add points to the plot
text(-1,.1,"extra text")       # add labels to the plot
title(main="title",sub="subtitle",xlab="x-axis",ylab="y-axis") # add annotations to plot
legend("topleft",c("a label","other label"), col=1:2, pch=20)  # add a legend
axis(4)               # add axis to the other edges
mtext("more text",3)  # (1=bottom, 2=left, 3=top, 4=right).

x <- seq(0,1,.025)
plot(x,dbeta(x,4,2), type="l")  # unite points using a line
curve(dbeta(x,2,4),from=0,to=1,type="l",col="red",add=TRUE) # add a new curve to the plot
abline(h=seq(0,2,.25),v=seq(0,1,.2),col="grey") # add a grid

# making several plots at once
xs <- seq(0,1,.05)
parameters <- c(.5,1,2,5)
par(mfrow=c(length(parameters),length(parameters)))
for(a in parameters) {
  for(b in parameters) {
    plot(xs,dbeta(xs,a,b), type="l", col="red",
         xlab=paste("beta(",a,",",b,")"), ylab="density")
  }
}

x <- rnorm(100)     
y <- x + rnorm(100) 
plot(x,y,type="n")  # plots everything except the data

#let's say that the first half of (x,y) are males, and the last half are females

#define levels for that
gender = gl(2,50,labels=c("male","female"))
#plot male points in blue
points(x[gender=="male"], y[gender=="male"], col="blue")
#plot male points in red
points(x[gender=="female"], y[gender=="female"], col="red")
abline(lm(x~y),col="purple")

# points can have size too:
set.seed(4321)
n.trees <- 100
tree.data <- data.frame(size   = floor(rexp(n.trees, 1/50)),
                        value  = rnorm(n.trees,1000,55),
                        age    = floor(rexp(n.trees, 1/50)))
tree.data$weight <- tree.data$age/50 * floor(rexp(n.trees, 1/150))

head(tree.data)

##   size     value age weight
## 1   51 1002.3564  40 149.60
## 2    1  967.6719   4   2.96
## 3   26  942.8321  47 109.98
## 4   25 1008.5532  16  14.08
## 5   29  991.2168  62 219.48
## 6   15 1102.1071  36  18.72

plot(tree.data$value, tree.data$weight, cex=log(tree.data$size+1),
     xlab="dollars", ylab="tons",main="Size of trees defined by point area")

# we can also divide trees by age assigning different colors
f <- function(age) { 
  if (age<50)
    return ("YOUNG")
  else if (age<90)
    return ("ADULT")
  else 
    return ("OLD")
}
tree.data$age.cat <- Map(f,tree.data$age)
head(tree.data)

##   size     value age weight age.cat
## 1   51 1002.3564  40 149.60   YOUNG
## 2    1  967.6719   4   2.96   YOUNG
## 3   26  942.8321  47 109.98   YOUNG
## 4   25 1008.5532  16  14.08   YOUNG
## 5   29  991.2168  62 219.48   ADULT
## 6   15 1102.1071  36  18.72   YOUNG

plot(tree.data$value, tree.data$weight, 
     xlab="dollars", ylab="tons", type="n",main="Size of trees defined by point area")
colors <- c("blue","green","red")
labels <- c("YOUNG","ADULT","OLD")
for(i in 1:3 ) {
  points(tree.data$value[tree.data$age.cat==labels[i]], 
         tree.data$weight[tree.data$age.cat==labels[i]], 
         cex=log(tree.data$size[tree.data$age.cat==labels[i]]+1), col=colors[i])
}
legend("topleft", tolower(labels), col = colors, pch = 20)  # add a legend

How to present and compute areas for a range of, say, a normal distribution

mean <- 0    # mean parameter 
sd   <- 1    # sd parameter
lb   <- -1.5 # lower bound
ub   <- 1.25 # upper bound

x  <- seq(-4, 4, length=101)
hx <- dnorm(x, mean, sd)

plot(x, hx, type="n", xlab="y", ylab="Density", main="Normal Distribution")
lines(x, hx)
# polygon draws the polygons whose vertices are given in x and y
i <- x >= lb & x <= ub
polygon(c(lb,    lb,            x[i],        ub,         ub), 
        c(0,dnorm(lb, mean, sd),hx[i],dnorm(ub, mean, sd),0), col="red")

area <- pnorm(ub, mean, sd) - pnorm(lb, mean, sd)
result <- paste("P(",lb,"< Y <",ub,") =", signif(area, digits=3))
# place label on top of graph
mtext(result,3)

# Another eg:
lambda <- 1/20 # sd parameter
lb   <- 0      # lower bound
ub   <- 60     # upper bound

x  <- seq(lb,ub*1.5,length=100)
hx <- dexp(x,lambda)

plot(x, hx, type="n", xlab="y", ylab="Density")
lines(x, hx)
# polygon draws the polygons whose vertices are given in x and y
i <- x >= lb & x <= ub
polygon(c(lb,x[i],ub), c(0,hx[i],0), col="red")

area <- pexp(ub, 1/20) - pexp(lb, 1/20)
result <- paste("P(",lb,"< Y <",ub,") =", signif(area, digits=6))
# place label on top of graph
mtext(result,3)
title(bquote(paste("Exponential Distribution with ", lambda, "= 1/20")))

Using a gradient [ref]:

x  <- seq(-3,3,0.01)
y1 <- dnorm(x,0,1)
y2 <- 0.5*dnorm(x,0,1)

x.shade <- seq(-2,1,0.01)

plot(x,y1,type="l",bty="L",xlab="X",ylab="dnorm(X)")
points(x,y2,type="l",col="gray")
l <- length(x.shade)
color <- heat.colors(l)
for (i in 1:l) {
  polygon(c(x.shade[i],rev(x.shade[i])),
          c(dnorm(x.shade[i],0,1),0.5*dnorm(rev(x.shade[i]),0,1)),
          border=color[i],col=NA)
}

Histogram, Boxplot and Barplot

Histogram shows the distribution of observations.

normal.data <- rnorm(200,mean=2,sd=1)

hist(normal.data, breaks=25, freq=FALSE) # freq decides between frequency vs. density
rug(normal.data)                         # shows actual data points
# A density plot captures the distribution shape by smoothing the histogram. You can specify the amount/type of smoothing.
lines(density(normal.data,bw=.1),col="black",lwd=2)  # bw defines the density smoothness
lines(density(normal.data),col="red",lwd=2)
lines(density(normal.data,bw=.5),col="blue",lwd=2)

boxplot(normal.data)

# Let's make some fake data:
some.data <- data.frame(name=sample(letters,26), 
                        age=signif(rnorm(26,45,10),2))
some.data$salary=(some.data$age/45)*signif(runif(26,1000,2000),4)
some.data$profession=sample(gl(2, 13, labels = c("Guard", "Driver")),26)
head(some.data)

##   name age   salary profession
## 1    e  52 1192.533     Driver
## 2    h  45 1062.000      Guard
## 3    i  33 1108.067     Driver
## 4    u  54 1501.200      Guard
## 5    d  48 1895.467     Driver
## 6    g  40 1456.889     Driver

# we can boxplot this stuff divided by a category, say profession:
boxplot(some.data$salary ~ some.data$profession)

# we can try do the same thing with age, but we need to cut some intervals first
c1 <- cut(some.data$age, breaks = seq(25,65,by=10))
table(c1)

## c1
## (25,35] (35,45] (45,55] (55,65] 
##       6       9       7       3

some.data$age.interval <- c1
boxplot(some.data$salary ~ some.data$age.interval, xlab="ages", ylab="wages")

size <- 200
another.data <- data.frame(id=sample(1:size), 
                           value=sample(seq(100,900,by=100),size,rep=TRUE))
head(another.data)

##    id value
## 1  93   200
## 2 192   100
## 3  65   800
## 4  25   200
## 5 117   500
## 6  44   100

# to show a histogram of the frequency of values, we use barplot
barplot(table(another.data$value),xlab="frequency", ylab="value",
        horiz=TRUE,xlim=c(0,5+max(table(another.data$value))))

# another way to show it
records <- matrix(c(2000:2019,sample(1:100,20)), nrow=2, byrow=TRUE)
bp <- barplot(records[2,], names.arg=records[1,], las=3, ylim=c(0,max(records[2,])+20))
text(bp, records[2,], labels = records[2,], pos = 3, cex=.7)

Labels with Math

plot(1:5,1:5,pch=19,xlab="",
     main=expression(
       paste(
         'exp', 
         bgroup("[", frac(- n * bar(x) ** 2, 2 * sigma ** 2),"]")) <= c ),
     sub=expression(
       paste(
         "1 - ", 
         Phi, 
         bgroup("[", c + frac(mu[0]-mu, lambda/sqrt(n)), "]"), 
         sep = ""))

)

3D perspectives

rotate       <- (-45)  # parameters used for 3D plots
tilt         <- 30
parallelness <- 5.0
shadeval     <- 0.05
perspcex     <- 0.7
ncontours    <- 12

xs <- seq(0,1,.025)
ys <- seq(0,1,.025)
f <- function(x,y) {
  dnorm(x,.5,.25) + dnorm(y,.5,.125)
}
zs <- outer(xs,ys,f) # if  f cannot be directly applied to 'outer' use Vectorize(f) instead

persp(xs , ys , zs,
      xlab="x" , ylab="y" , zlab="f(x,y)" , 
      main="Main Title" , cex=perspcex, lwd=0.1  , 
      xlim=c(0,1) , ylim=c(0,1) , zlim=c(-1,4),
      theta=rotate , phi=tilt , d=parallelness ,
      shade=shadeval)

## Warning in persp.default(xs, ys, zs, xlab = "x", ylab = "y", zlab =
## "f(x,y)", : surface extends beyond the box

# Also the same function in a contour map:
contour(xs , ys , zs,
        main=bquote(" "), levels=signif(seq(0,4,length=ncontours),3) ,
        drawlabels=FALSE ,xlab="x" , ylab="y" )
# other ways to 3D plot:
library(aws)

## Loading required package: awsMethods

## 
## Use the function setCores() to change the number of CPU cores.

## Loading required package: gsl

library(lattice)

xs <- rnorm(100000,0,1)
ys <- rpois(100000,3) 
pts <- cbind(xs,ys)
delta <- 30
bins <- binning(pts,NULL,nbins=c(delta,delta)) # bin data into little squares
freqs <- matrix(bins$table.freq,delta,delta)   # convert table into matrix
plot(wireframe(freqs, zoom = .9, lwd = 0.01, 
               xlab="x",ylab="y",zlab="z",
               scales=list(arrows=FALSE,distance=1,tick.number=8,draw=TRUE),
               screen = list(z = 75, x = -70, y = 3), # screen rotation
               drape = TRUE, colorkey = FALSE))

# the following eg shows the graph of a given 2D function
require(lattice)
x <- seq(-pi, pi, len = 20)
y <- seq(-pi, pi, len = 20)
g <- expand.grid(x = x, y = y)
g$z <- sin(sqrt(g$x^2 + g$y^2)) # insert funtion here
print(wireframe(z ~ x * y, g, drape = TRUE,
                aspect = c(3,1), colorkey = TRUE))

# to draw 3D histograms, execute the next two functions:
library(rgl)

binplot.3d <- function(x,y,z,alpha=1,topcol="#ff0000",sidecol="#aaaaaa") {
  
  save <- par3d(skipRedraw=TRUE)
  on.exit(par3d(save))
  
  x1<-c(rep(c(x[1],x[2],x[2],x[1]),3),rep(x[1],4),rep(x[2],4))
  z1<-c(rep(0,4),rep(c(0,0,z,z),4))
  y1<-c(y[1],y[1],y[2],y[2],rep(y[1],4),rep(y[2],4),rep(c(y[1],y[2],y[2],y[1]),2))
  x2<-c(rep(c(x[1],x[1],x[2],x[2]),2),rep(c(x[1],x[2],rep(x[1],3),rep(x[2],3)),2))
  z2<-c(rep(c(0,z),4),rep(0,8),rep(z,8) )
  y2<-c(rep(y[1],4),rep(y[2],4),rep(c(rep(y[1],3),rep(y[2],3),y[1],y[2]),2) )
  rgl.quads(x1,z1,y1,col=rep(sidecol,each=4),alpha=alpha)
  rgl.quads(c(x[1],x[2],x[2],x[1]),rep(z,4),c(y[1],y[1],y[2],y[2]),
            col=rep(topcol,each=4),alpha=1) 
  rgl.lines(x2,z2,y2,col="#000000")
}

hist3d <- function(x,y=NULL,nclass="auto",alpha=1,col="#ff0000",scale=10) {
  
  save <- par3d(skipRedraw=TRUE)
  on.exit(par3d(save))

  xy <- xy.coords(x,y)
  x <- xy$x
  y <- xy$y
  n<-length(x)
  if (nclass == "auto") { nclass<-ceiling(sqrt(nclass.Sturges(x))) }
  breaks.x <- seq(min(x),max(x),length=(nclass+1))
  breaks.y <- seq(min(y),max(y),length=(nclass+1))
  z<-matrix(0,(nclass),(nclass))
  for (i in 1:nclass) 
  {
    for (j in 1:nclass) 
    {
      z[i, j] <- (1/n)*sum(x < breaks.x[i+1] & y < breaks.y[j+1] & 
                             x >= breaks.x[i] & y >= breaks.y[j])
      binplot.3d(c(breaks.x[i],breaks.x[i+1]),c(breaks.y[j],breaks.y[j+1]),
                 scale*z[i,j],alpha=alpha,topcol=col)
    }
  }
}

# and then try it. Notice that a new graphic window will open, which can be manipulated by the mouse

#eg of use:
rho  <- .9
std <- sqrt(1-rho^2)

nsim <- 10000
X <- c(rnorm(1), rep(NA,nsim-1))
Y <- c(rnorm(1), rep(NA,nsim-1))

for(i in 2:nsim) {
  X[i] <- rnorm(1, rho*Y[i-1], std)
  Y[i] <- rnorm(1, rho*X[i],   std)
}

# draw 3d histogram
hist3d(X,Y,nclass=30,col="cyan",scale=100)

Confidence Intervals

cf. stackoverflow

set.seed(101)
x <- seq(-2,2,length.out=20)
df <- data.frame(x = x,
                 y = x + rnorm(20,0,1))

plot(y ~ x, data = df)

# model
mod <- lm(y ~ x, data = df)

# predicts + interval
newx <- seq(min(df$x), max(df$x), length.out=100)
preds <- predict(mod, newdata = data.frame(x=newx), 
                 interval = 'confidence')

# plot
plot(y ~ x, data = df, type = 'n')
# add fill
polygon(c(rev(newx), newx), c(rev(preds[ ,3]), preds[ ,2]), col = 'grey80', border = NA)
# model
abline(mod)
# intervals
lines(newx, preds[ ,3], lty = 'dashed', col = 'red')
lines(newx, preds[ ,2], lty = 'dashed', col = 'red')

Graphical Tools

João Neto

May 2014

Scatter plots

Histogram, Boxplot and Barplot

Labels with Math

3D perspectives

Confidence Intervals