Refs:

ggplot2 implements Wilkinson's grammar of graphics, which describes and conceptually organizes the features that underlie statistical graphics.

The most important concepts are:

library(ggplot2)
library(gridExtra)  # for presenting plots side by side
FALSE Loading required package: grid
set.seed(101)  # fix the RNG state so the same rows are drawn on every run
# Keep a random sample of 100 rows of `diamonds` for the examples below.
d <- diamonds[sample(nrow(diamonds), size = 100), ]
head(d, n = 6)
FALSE       carat       cut color clarity depth table price    x    y    z
FALSE 20077  1.00 Very Good     F    VVS2  60.0    62  8553 6.43 6.46 3.87
FALSE 2364   0.90   Premium     J     VS1  62.3    56  3175 6.23 6.19 3.87
FALSE 38279  0.21   Premium     D     VS2  59.1    62   386 3.89 3.86 2.29
FALSE 35474  0.39     Ideal     G    VVS2  61.9    56   902 4.68 4.70 2.90
FALSE 13477  1.15     Ideal     I     VS2  61.7    57  5534 6.72 6.74 4.15
FALSE 16184  1.10   Premium     D     SI1  60.7    55  6468 6.74 6.71 4.08

qplot

qplot (short for "quick plot") is ggplot2’s counterpart to base R’s plot function.

# Three scatterplots of the sample, shown side by side:
# raw values, both axes log-transformed, and carat against x * y * z.
plot1 <- qplot(x = carat, y = price, data = d)
plot2 <- qplot(x = log(carat), y = log(price), data = d)
plot3 <- qplot(x = carat, y = x * y * z, data = d)  # x * y * z gives the volume
grid.arrange(plot1, plot2, plot3, ncol = 3)

qplot automates some aesthetics like how to assign colors and shapes to data:

# Map data columns to aesthetics: point colour and size on the left,
# point shape on the right.
plot1 <- qplot(
  x = carat, y = price, data = d,
  colour = color,
  size = carat
)
plot2 <- qplot(
  x = carat, y = price, data = d,
  shape = cut
)
grid.arrange(plot1, plot2, ncol = 2)

Notice that the plots also come with a legend. For each aesthetic attribute there is a scale function that maps the data values to aesthetic values. E.g., in the left plot, the colour value D was mapped to red.

qplot accepts different types of geometric objects, or geoms, each of which produces a different type of graphic. When both x and y are supplied, the default is geom="point", i.e., the scatterplots we’ve seen so far. Other geoms are possible, and several can be combined in one plot:

# Each plot draws the points plus one additional geom on top of them.
plot1 <- qplot(
  x = carat, y = price, data = d,
  geom = c("point", "smooth")  # default smooth by loess regression
)
plot2 <- qplot(
  x = carat, y = price, data = d,
  geom = c("point", "boxplot")
)
plot3 <- qplot(
  x = carat, y = price, data = d,
  geom = c("point", "line")
)
grid.arrange(plot1, plot2, plot3, ncol = 3)

Geoms for 1D data:

# One-variable plots: the geom is chosen to suit the type of the variable.
plot1 <- qplot(x = carat, data = d, geom = "histogram")  # continuous values
plot2 <- qplot(x = color, data = d, geom = "bar")        # discrete values
grid.arrange(plot1, plot2, ncol = 2)

# Two further one-variable geoms for carat: frequency polygon and density.
plot1 <- qplot(x = carat, data = d, geom = "freqpoly")
plot2 <- qplot(x = carat, data = d, geom = "density")
grid.arrange(plot1, plot2, ncol = 2)

Smooth geoms

The smooth geom can be used with different regression methods:

# Fit the smoother with lm() instead of the default method.
# The smooth layer is added explicitly with geom_smooth() so that `method` and
# `formula` reach only that layer: when they are passed through qplot() they
# are forwarded to every geom listed in `geom`, and the point layer does not
# accept them. The layers drawn (points, then smooth) are unchanged.
plot1 <- qplot(carat, price, data = d) +
  geom_smooth(method = "lm")  # straight-line fit
plot2 <- qplot(carat, price, data = d) +
  geom_smooth(method = "lm", formula = y ~ poly(x, 3))  # polynomial regression
grid.arrange(plot1, plot2, ncol = 2)

# Two more fitting methods for the smooth layer. As the smoothing parameters
# only apply to the smooth geom, the layer is added with geom_smooth() rather
# than passing `method`/`formula` through qplot(), which would forward them to
# the point layer as well.
library(splines)  # provides ns() for natural splines
plot3 <- qplot(carat, price, data = d) +
  geom_smooth(method = "lm", formula = y ~ ns(x, 5))
library(MASS)  # provides rlm() for robust regression
plot4 <- qplot(carat, price, data = d) +
  geom_smooth(method = "rlm")
grid.arrange(plot3, plot4, ncol = 2)

Jitter and opacity

Sometimes there are too many data points for a direct plot to give a useful picture of the data. One remedy is to jitter the points (add a small amount of random noise so that coincident points are spread around their common centre); another is to make the points semi-transparent, i.e., to set how many points must overlap in an area before it is drawn fully opaque. The two can also be combined.

# Full data set: opaque points on the left, semi-transparent points on the
# right so that densely populated regions stand out.
plot1 <- qplot(x = carat, y = price, data = diamonds)
plot2 <- qplot(
  x = carat, y = price, data = diamonds,
  alpha = I(1 / 50)  # 50 overlapping points give total opacity
)
grid.arrange(plot1, plot2, ncol = 2)