library(dplyr)
library(MASS)  # get a dataset for manipulation

my.data <- tbl_df(Boston)  # wraps a local data frame for better printing
my.data
## Source: local data frame [506 x 14]
## 
##       crim   zn indus chas   nox    rm   age   dis rad tax ptratio black
## 1  0.00632 18.0  2.31    0 0.538 6.575  65.2 4.090   1 296    15.3 396.9
## 2  0.02731  0.0  7.07    0 0.469 6.421  78.9 4.967   2 242    17.8 396.9
## 3  0.02729  0.0  7.07    0 0.469 7.185  61.1 4.967   2 242    17.8 392.8
## 4  0.03237  0.0  2.18    0 0.458 6.998  45.8 6.062   3 222    18.7 394.6
## 5  0.06905  0.0  2.18    0 0.458 7.147  54.2 6.062   3 222    18.7 396.9
## 6  0.02985  0.0  2.18    0 0.458 6.430  58.7 6.062   3 222    18.7 394.1
## 7  0.08829 12.5  7.87    0 0.524 6.012  66.6 5.561   5 311    15.2 395.6
## 8  0.14455 12.5  7.87    0 0.524 6.172  96.1 5.950   5 311    15.2 396.9
## 9  0.21124 12.5  7.87    0 0.524 5.631 100.0 6.082   5 311    15.2 386.6
## 10 0.17004 12.5  7.87    0 0.524 6.004  85.9 6.592   5 311    15.2 386.7
## ..     ...  ...   ...  ...   ...   ...   ...   ... ... ...     ...   ...
## Variables not shown: lstat (dbl), medv (dbl)

Verbs

These correspond to basic actions of data manipulation.

Filter rows with filter()

filter(my.data, rad==4)
## Source: local data frame [110 x 14]
## 
##      crim zn indus chas   nox    rm  age   dis rad tax ptratio black lstat
## 1  0.6298  0  8.14    0 0.538 5.949 61.8 4.707   4 307      21 396.9  8.26
## 2  0.6380  0  8.14    0 0.538 6.096 84.5 4.462   4 307      21 380.0 10.26
## 3  0.6274  0  8.14    0 0.538 5.834 56.5 4.499   4 307      21 395.6  8.47
## 4  1.0539  0  8.14    0 0.538 5.935 29.3 4.499   4 307      21 386.9  6.58
## 5  0.7842  0  8.14    0 0.538 5.990 81.7 4.258   4 307      21 386.8 14.67
## 6  0.8027  0  8.14    0 0.538 5.456 36.6 3.796   4 307      21 289.0 11.69
## 7  0.7258  0  8.14    0 0.538 5.727 69.5 3.796   4 307      21 390.9 11.28
## 8  1.2518  0  8.14    0 0.538 5.570 98.1 3.798   4 307      21 376.6 21.02
## 9  0.8520  0  8.14    0 0.538 5.965 89.2 4.012   4 307      21 392.5 13.83
## 10 1.2325  0  8.14    0 0.538 6.142 91.7 3.977   4 307      21 396.9 18.72
## ..    ... ..   ...  ...   ...   ...  ...   ... ... ...     ...   ...   ...
## Variables not shown: medv (dbl)
filter(my.data, rad==1 & tax>320)
## Source: local data frame [3 x 14]
## 
##      crim zn indus chas   nox    rm  age   dis rad tax ptratio black lstat
## 1 0.02498  0  1.89    0 0.518 6.540 59.7 6.267   1 422    15.9 390.0  8.65
## 2 0.02899 40  1.25    0 0.429 6.939 34.5 8.792   1 335    19.7 389.9  5.89
## 3 0.06211 40  1.25    0 0.429 6.490 44.4 8.792   1 335    19.7 396.9  5.98
## Variables not shown: medv (dbl)
filter(my.data, rad==1 & (tax<200|tax>300))
## Source: local data frame [8 x 14]
## 
##      crim zn indus chas    nox    rm  age   dis rad tax ptratio black
## 1 0.04819 80  3.64    0 0.3920 6.108 32.0 9.220   1 315    16.4 392.9
## 2 0.03548 80  3.64    0 0.3920 5.876 19.1 9.220   1 315    16.4 395.2
## 3 0.01501 90  1.21    1 0.4010 7.923 24.8 5.885   1 198    13.6 395.5
## 4 0.05023 35  6.06    0 0.4379 5.706 28.4 6.641   1 304    16.9 394.0
## 5 0.03466 35  6.06    0 0.4379 6.031 23.3 6.641   1 304    16.9 362.2
## 6 0.02498  0  1.89    0 0.5180 6.540 59.7 6.267   1 422    15.9 390.0
## 7 0.02899 40  1.25    0 0.4290 6.939 34.5 8.792   1 335    19.7 389.9
## 8 0.06211 40  1.25    0 0.4290 6.490 44.4 8.792   1 335    19.7 396.9
## Variables not shown: lstat (dbl), medv (dbl)

Arrange rows with arrange()

arrange(my.data, rad, tax) # order first by 'rad' then by 'tax'
## Source: local data frame [506 x 14]
## 
##       crim zn indus chas   nox    rm  age   dis rad tax ptratio black
## 1  0.01501 90  1.21    1 0.401 7.923 24.8 5.885   1 198    13.6 395.5
## 2  0.01965 80  1.76    0 0.385 6.230 31.5 9.089   1 241    18.2 341.6
## 3  0.02187 60  2.93    0 0.401 6.800  9.9 6.220   1 265    15.6 393.4
## 4  0.01439 60  2.93    0 0.401 6.604 18.8 6.220   1 265    15.6 376.7
## 5  0.06263  0 11.93    0 0.573 6.593 69.1 2.479   1 273    21.0 392.0
## 6  0.04527  0 11.93    0 0.573 6.120 76.7 2.288   1 273    21.0 396.9
## 7  0.06076  0 11.93    0 0.573 6.976 91.0 2.167   1 273    21.0 396.9
## 8  0.10959  0 11.93    0 0.573 6.794 89.3 2.389   1 273    21.0 393.4
## 9  0.04741  0 11.93    0 0.573 6.030 80.8 2.505   1 273    21.0 396.9
## 10 0.01301 35  1.52    0 0.442 7.241 49.3 7.038   1 284    15.5 394.7
## ..     ... ..   ...  ...   ...   ...  ...   ... ... ...     ...   ...
## Variables not shown: lstat (dbl), medv (dbl)
arrange(my.data, rad, desc(tax), age)
## Source: local data frame [506 x 14]
## 
##       crim zn indus chas    nox    rm  age   dis rad tax ptratio black
## 1  0.02498  0  1.89    0 0.5180 6.540 59.7 6.267   1 422    15.9 390.0
## 2  0.02899 40  1.25    0 0.4290 6.939 34.5 8.792   1 335    19.7 389.9
## 3  0.06211 40  1.25    0 0.4290 6.490 44.4 8.792   1 335    19.7 396.9
## 4  0.03548 80  3.64    0 0.3920 5.876 19.1 9.220   1 315    16.4 395.2
## 5  0.04819 80  3.64    0 0.3920 6.108 32.0 9.220   1 315    16.4 392.9
## 6  0.03466 35  6.06    0 0.4379 6.031 23.3 6.641   1 304    16.9 362.2
## 7  0.05023 35  6.06    0 0.4379 5.706 28.4 6.641   1 304    16.9 394.0
## 8  0.01096 55  2.25    0 0.3890 6.453 31.9 7.307   1 300    15.3 394.7
## 9  0.00632 18  2.31    0 0.5380 6.575 65.2 4.090   1 296    15.3 396.9
## 10 0.00906 90  2.97    0 0.4000 7.088 20.8 7.307   1 285    15.3 394.7
## ..     ... ..   ...  ...    ...   ...  ...   ... ... ...     ...   ...
## Variables not shown: lstat (dbl), medv (dbl)

Select columns with select()

dplyr::select(my.data, rad, age) # explicit package since MASS masked dplyr's 'select'
## Source: local data frame [506 x 2]
## 
##    rad   age
## 1    1  65.2
## 2    2  78.9
## 3    2  61.1
## 4    3  45.8
## 5    3  54.2
## 6    3  58.7
## 7    5  66.6
## 8    5  96.1
## 9    5 100.0
## 10   5  85.9
## .. ...   ...
dplyr::select(my.data, nox:dis)  # select data frame features from 'nox' to 'dis' 
## Source: local data frame [506 x 4]
## 
##      nox    rm   age   dis
## 1  0.538 6.575  65.2 4.090
## 2  0.469 6.421  78.9 4.967
## 3  0.469 7.185  61.1 4.967
## 4  0.458 6.998  45.8 6.062
## 5  0.458 7.147  54.2 6.062
## 6  0.458 6.430  58.7 6.062
## 7  0.524 6.012  66.6 5.561
## 8  0.524 6.172  96.1 5.950
## 9  0.524 5.631 100.0 6.082
## 10 0.524 6.004  85.9 6.592
## ..   ...   ...   ...   ...
dplyr::select(my.data, starts_with('c')) # select columns that start with 'c'
## Source: local data frame [506 x 2]
## 
##       crim chas
## 1  0.00632    0
## 2  0.02731    0
## 3  0.02729    0
## 4  0.03237    0
## 5  0.06905    0
## 6  0.02985    0
## 7  0.08829    0
## 8  0.14455    0
## 9  0.21124    0
## 10 0.17004    0
## ..     ...  ...
dplyr::select(my.data, matches('^[cd]')) # select columns that start with 'c' or 'd' (regular expression)
## Source: local data frame [506 x 3]
## 
##       crim chas   dis
## 1  0.00632    0 4.090
## 2  0.02731    0 4.967
## 3  0.02729    0 4.967
## 4  0.03237    0 6.062
## 5  0.06905    0 6.062
## 6  0.02985    0 6.062
## 7  0.08829    0 5.561
## 8  0.14455    0 5.950
## 9  0.21124    0 6.082
## 10 0.17004    0 6.592
## ..     ...  ...   ...
dplyr::select(my.data, -matches('^[cd]')) # drop columns that start with 'c' or 'd' (regular expression)
## Source: local data frame [506 x 11]
## 
##      zn indus   nox    rm   age rad tax ptratio black lstat medv
## 1  18.0  2.31 0.538 6.575  65.2   1 296    15.3 396.9  4.98 24.0
## 2   0.0  7.07 0.469 6.421  78.9   2 242    17.8 396.9  9.14 21.6
## 3   0.0  7.07 0.469 7.185  61.1   2 242    17.8 392.8  4.03 34.7
## 4   0.0  2.18 0.458 6.998  45.8   3 222    18.7 394.6  2.94 33.4
## 5   0.0  2.18 0.458 7.147  54.2   3 222    18.7 396.9  5.33 36.2
## 6   0.0  2.18 0.458 6.430  58.7   3 222    18.7 394.1  5.21 28.7
## 7  12.5  7.87 0.524 6.012  66.6   5 311    15.2 395.6 12.43 22.9
## 8  12.5  7.87 0.524 6.172  96.1   5 311    15.2 396.9 19.15 27.1
## 9  12.5  7.87 0.524 5.631 100.0   5 311    15.2 386.6 29.93 16.5
## 10 12.5  7.87 0.524 6.004  85.9   5 311    15.2 386.7 17.10 18.9
## ..  ...   ...   ...   ...   ... ... ...     ...   ...   ...  ...

Rename columns with rename()

rename(my.data, Z.N=zn)
## Source: local data frame [506 x 14]
## 
##       crim  Z.N indus chas   nox    rm   age   dis rad tax ptratio black
## 1  0.00632 18.0  2.31    0 0.538 6.575  65.2 4.090   1 296    15.3 396.9
## 2  0.02731  0.0  7.07    0 0.469 6.421  78.9 4.967   2 242    17.8 396.9
## 3  0.02729  0.0  7.07    0 0.469 7.185  61.1 4.967   2 242    17.8 392.8
## 4  0.03237  0.0  2.18    0 0.458 6.998  45.8 6.062   3 222    18.7 394.6
## 5  0.06905  0.0  2.18    0 0.458 7.147  54.2 6.062   3 222    18.7 396.9
## 6  0.02985  0.0  2.18    0 0.458 6.430  58.7 6.062   3 222    18.7 394.1
## 7  0.08829 12.5  7.87    0 0.524 6.012  66.6 5.561   5 311    15.2 395.6
## 8  0.14455 12.5  7.87    0 0.524 6.172  96.1 5.950   5 311    15.2 396.9
## 9  0.21124 12.5  7.87    0 0.524 5.631 100.0 6.082   5 311    15.2 386.6
## 10 0.17004 12.5  7.87    0 0.524 6.004  85.9 6.592   5 311    15.2 386.7
## ..     ...  ...   ...  ...   ...   ...   ...   ... ... ...     ...   ...
## Variables not shown: lstat (dbl), medv (dbl)

Add new columns (that are functions of existing columns) with mutate()

mutate(my.data, twice.age = 2*age, new.col = sqrt(twice.age)) # can refer to new columns of the spot
## Source: local data frame [506 x 16]
## 
##       crim   zn indus chas   nox    rm   age   dis rad tax ptratio black
## 1  0.00632 18.0  2.31    0 0.538 6.575  65.2 4.090   1 296    15.3 396.9
## 2  0.02731  0.0  7.07    0 0.469 6.421  78.9 4.967   2 242    17.8 396.9
## 3  0.02729  0.0  7.07    0 0.469 7.185  61.1 4.967   2 242    17.8 392.8
## 4  0.03237  0.0  2.18    0 0.458 6.998  45.8 6.062   3 222    18.7 394.6
## 5  0.06905  0.0  2.18    0 0.458 7.147  54.2 6.062   3 222    18.7 396.9
## 6  0.02985  0.0  2.18    0 0.458 6.430  58.7 6.062   3 222    18.7 394.1
## 7  0.08829 12.5  7.87    0 0.524 6.012  66.6 5.561   5 311    15.2 395.6
## 8  0.14455 12.5  7.87    0 0.524 6.172  96.1 5.950   5 311    15.2 396.9
## 9  0.21124 12.5  7.87    0 0.524 5.631 100.0 6.082   5 311    15.2 386.6
## 10 0.17004 12.5  7.87    0 0.524 6.004  85.9 6.592   5 311    15.2 386.7
## ..     ...  ...   ...  ...   ...   ...   ...   ... ... ...     ...   ...
## Variables not shown: lstat (dbl), medv (dbl), twice.age (dbl), new.col
##   (dbl)
transmute(my.data, twice.age = 2*age, new.col = sqrt(twice.age)) # transmute is like mutate bt only keeps the new columns
## Source: local data frame [506 x 2]
## 
##    twice.age new.col
## 1      130.4  11.419
## 2      157.8  12.562
## 3      122.2  11.054
## 4       91.6   9.571
## 5      108.4  10.412
## 6      117.4  10.835
## 7      133.2  11.541
## 8      192.2  13.864
## 9      200.0  14.142
## 10     171.8  13.107
## ..       ...     ...

Summarise values with summarise()

summarise(my.data, mean.age = mean(age, na.rm = TRUE))
## Source: local data frame [1 x 1]
## 
##   mean.age
## 1    68.57

Select rows by position with slice():

slice(my.data, c(2:6,8))
## Source: local data frame [6 x 14]
## 
##      crim   zn indus chas   nox    rm  age   dis rad tax ptratio black
## 1 0.02731  0.0  7.07    0 0.469 6.421 78.9 4.967   2 242    17.8 396.9
## 2 0.02729  0.0  7.07    0 0.469 7.185 61.1 4.967   2 242    17.8 392.8
## 3 0.03237  0.0  2.18    0 0.458 6.998 45.8 6.062   3 222    18.7 394.6
## 4 0.06905  0.0  2.18    0 0.458 7.147 54.2 6.062   3 222    18.7 396.9
## 5 0.02985  0.0  2.18    0 0.458 6.430 58.7 6.062   3 222    18.7 394.1
## 6 0.14455 12.5  7.87    0 0.524 6.172 96.1 5.950   5 311    15.2 396.9
## Variables not shown: lstat (dbl), medv (dbl)
slice(my.data, n()) # last record
## Source: local data frame [1 x 14]
## 
##      crim zn indus chas   nox   rm  age   dis rad tax ptratio black lstat
## 1 0.04741  0 11.93    0 0.573 6.03 80.8 2.505   1 273      21 396.9  7.88
## Variables not shown: medv (dbl)

Extract distinct (unique) rows with distict()

d <- dplyr::select(my.data, tax)
arrange( distinct(d, tax), tax)
## Source: local data frame [66 x 1]
## 
##    tax
## 1  187
## 2  188
## 3  193
## 4  198
## 5  216
## 6  222
## 7  223
## 8  224
## 9  226
## 10 233
## 11 241
## 12 242
## 13 243
## 14 244
## 15 245
## 16 247
## 17 252
## 18 254
## 19 255
## 20 256
## 21 264
## 22 265
## 23 270
## 24 273
## 25 276
## 26 277
## 27 279
## 28 280
## 29 281
## 30 284
## 31 285
## 32 287
## 33 289
## 34 293
## 35 296
## 36 300
## 37 304
## 38 305
## 39 307
## 40 311
## 41 313
## 42 315
## 43 329
## 44 330
## 45 334
## 46 335
## 47 337
## 48 345
## 49 348
## 50 351
## 51 352
## 52 358
## 53 370
## 54 384
## 55 391
## 56 398
## 57 402
## 58 403
## 59 411
## 60 422
## 61 430
## 62 432
## 63 437
## 64 469
## 65 666
## 66 711

Create samples with sample_n() and sample_f()

sample_n(my.data, 6)      # a sample of 6 records
## Source: local data frame [6 x 14]
## 
##       crim zn indus chas   nox    rm   age   dis rad tax ptratio  black
## 1 13.67810  0 18.10    0 0.740 5.935  87.9 1.821  24 666    20.2  68.95
## 2  0.03113  0  4.39    0 0.442 6.014  48.5 8.014   3 352    18.8 385.64
## 3  0.03548 80  3.64    0 0.392 5.876  19.1 9.220   1 315    16.4 395.18
## 4 16.81180  0 18.10    0 0.700 5.277  98.1 1.426  24 666    20.2 396.90
## 5  0.03150 95  1.47    0 0.403 6.975  15.3 7.653   3 402    17.0 396.90
## 6  1.62864  0 21.89    0 0.624 5.019 100.0 1.439   4 437    21.2 396.90
## Variables not shown: lstat (dbl), medv (dbl)
sample_frac(my.data, .03) # a sample of 3% of the records
## Source: local data frame [15 x 14]
## 
##        crim zn indus chas   nox    rm  age   dis rad tax ptratio  black
## 1   1.05393  0  8.14    0 0.538 5.935 29.3 4.499   4 307    21.0 386.85
## 2   6.71772  0 18.10    0 0.713 6.749 92.6 2.324  24 666    20.2   0.32
## 3   0.57529  0  6.20    0 0.507 8.337 73.3 3.838   8 307    17.4 385.91
## 4   7.05042  0 18.10    0 0.614 6.103 85.1 2.022  24 666    20.2   2.52
## 5   0.06664  0  4.05    0 0.510 6.546 33.1 3.132   5 296    16.6 390.96
## 6   0.08664 45  3.44    0 0.437 7.178 26.3 6.480   5 398    15.2 390.49
## 7   0.27957  0  9.69    0 0.585 5.926 42.6 2.382   6 391    19.2 396.90
## 8   0.61154 20  3.97    0 0.647 8.704 86.9 1.801   5 264    13.0 389.70
## 9   0.76162 20  3.97    0 0.647 5.560 62.8 1.986   5 264    13.0 392.40
## 10  0.01778 95  1.47    0 0.403 7.135 13.9 7.653   3 402    17.0 384.30
## 11  3.69695  0 18.10    0 0.718 4.963 91.4 1.752  24 666    20.2 316.03
## 12  0.02498  0  1.89    0 0.518 6.540 59.7 6.267   1 422    15.9 389.96
## 13  0.09164  0 10.81    0 0.413 6.065  7.8 5.287   4 305    19.2 390.91
## 14  0.25915  0 21.89    0 0.624 5.693 96.0 1.788   4 437    21.2 392.11
## 15 13.07510  0 18.10    0 0.580 5.713 56.7 2.824  24 666    20.2 396.90
## Variables not shown: lstat (dbl), medv (dbl)
sample_frac(my.data, .03, replace=TRUE) # a bootstrap sample of 3% of the records
## Source: local data frame [15 x 14]
## 
##        crim zn indus chas   nox    rm   age   dis rad tax ptratio  black
## 1   0.06664  0  4.05    0 0.510 6.546  33.1 3.132   5 296    16.6 390.96
## 2  37.66190  0 18.10    0 0.679 6.202  78.7 1.863  24 666    20.2  18.82
## 3   0.10469 40  6.41    1 0.447 7.267  49.0 4.787   4 254    17.6 389.25
## 4   0.04932 33  2.18    0 0.472 6.849  70.3 3.183   7 222    18.4 396.90
## 5   0.17331  0  9.69    0 0.585 5.707  54.0 2.382   6 391    19.2 396.90
## 6   0.04932 33  2.18    0 0.472 6.849  70.3 3.183   7 222    18.4 396.90
## 7   6.28807  0 18.10    0 0.740 6.341  96.4 2.072  24 666    20.2 318.01
## 8   0.66351 20  3.97    0 0.647 7.333 100.0 1.895   5 264    13.0 383.29
## 9   3.53501  0 19.58    1 0.871 6.152  82.6 1.746   5 403    14.7  88.01
## 10  0.22438  0  9.69    0 0.585 6.027  79.7 2.498   6 391    19.2 396.90
## 11  0.17899  0  9.69    0 0.585 5.670  28.8 2.799   6 391    19.2 393.29
## 12 25.04610  0 18.10    0 0.693 5.987 100.0 1.589  24 666    20.2 396.90
## 13  0.08187  0  2.89    0 0.445 7.820  36.9 3.495   2 276    18.0 393.53
## 14  2.33099  0 19.58    0 0.871 5.186  93.8 1.530   5 403    14.7 356.99
## 15  0.35809  0  6.20    1 0.507 6.951  88.5 2.862   8 307    17.4 391.70
## Variables not shown: lstat (dbl), medv (dbl)

There is also an option weight=vector that allows us to sample a weightned sample of the dataset. The vector must have the same size as the number of rows of the dataset (weights are automatically standardised to sum to 1).

Grouped operations

These verbs are useful, but they become really powerful when you combine them with the idea of “group by”, repeating the operation individually on groups of observations within the tbl. In dplyr, you use the group_by() function to describe how to break a dataset down into groups of rows. You can then use the resulting object in the exactly the same functions as above; they’ll automatically work “by group” when the input is a grouped tbl. ref.

taxes  <- group_by(my.data, tax) # group dataset by tax
report <- summarise(taxes, count=n(), mean.age=mean(age, na.rm = TRUE))
arrange(report, count, tax)
## Source: local data frame [66 x 3]
## 
##    tax count mean.age
## 1  187     1   36.100
## 2  198     1   24.800
## 3  226     1   21.900
## 4  241     1   31.500
## 5  244     1   34.200
## 6  255     1   32.000
## 7  256     1   40.500
## 8  280     1   29.700
## 9  285     1   20.800
## 10 313     1   35.700
## 11 351     1   27.700
## 12 422     1   59.700
## 13 469     1   47.600
## 14 242     2   70.000
## 15 252     2   18.800
## 16 265     2   14.350
## 17 315     2   25.550
## 18 334     2   20.700
## 19 335     2   39.450
## 20 337     2   24.450
## 21 348     2   27.050
## 22 352     2   50.400
## 23 370     2   42.250
## 24 402     2   14.600
## 25 411     2   27.200
## 26 245     3   26.333
## 27 293     3   33.267
## 28 345     3   30.400
## 29 358     3   25.833
## 30 430     3   25.067
## 31 243     4   37.800
## 32 247     4   51.500
## 33 279     4   50.325
## 34 281     4   45.700
## 35 305     4    9.525
## 36 216     5   48.620
## 37 223     5   46.080
## 38 254     5   36.880
## 39 273     5   81.380
## 40 289     5   40.160
## 41 711     5   94.260
## 42 329     6   30.917
## 43 188     7   89.071
## 44 222     7   57.157
## 45 270     7   64.171
## 46 284     7   56.643
## 47 300     7   38.957
## 48 311     7   80.686
## 49 193     8   75.487
## 50 287     8   44.775
## 51 296     8   66.825
## 52 391     8   60.925
## 53 233     9   40.656
## 54 276     9   70.011
## 55 432     9   81.744
## 56 224    10   42.330
## 57 330    10   36.450
## 58 277    11   56.873
## 59 384    11   84.591
## 60 264    12   83.558
## 61 398    12   37.442
## 62 304    14   65.757
## 63 437    15   97.173
## 64 403    30   95.163
## 65 307    40   76.918
## 66 666   132   89.805

Some tools:

rads <- group_by(my.data, rad)
summarise(rads, n.taxes = n(), diff.taxes = n_distinct(tax), first.tax=first(tax), last.tax=last(tax))
## Source: local data frame [9 x 5]
## 
##   rad n.taxes diff.taxes first.tax last.tax
## 1   1      20         12       296      273
## 2   2      24          7       242      348
## 3   3      38         11       222      352
## 4   4     110         21       307      711
## 5   5     115         16       311      187
## 6   6      26          4       432      391
## 7   7      17          3       330      222
## 8   8      24          2       284      307
## 9  24     132          1       666      666

We can provide our own functions (slower performance, unless is in C++)

my.f <- function(x) sqrt(x)+1
summarise(rads, f = my.f(mean(age)))
## Source: local data frame [9 x 2]
## 
##   rad      f
## 1   1  7.710
## 2   2  9.048
## 3   3  8.022
## 4   4  8.800
## 5   5  9.320
## 6   6  8.755
## 7   7  7.336
## 8   8  9.207
## 9  24 10.477

When you group by multiple variables, each summary peels off one level of the grouping. That makes it easy to progressively roll-up a dataset:

taxes.rad <- group_by(my.data, tax, rad)
by.rad    <- summarise(taxes.rad, size=n())
arrange(by.rad, tax)
## Source: local data frame [77 x 3]
## Groups: tax
## 
##    tax rad size
## 1  187   5    1
## 2  188   2    7
## 3  193   3    8
## 4  198   1    1
## 5  216   3    1
## 6  216   5    4
## 7  222   3    3
## 8  222   7    4
## 9  223   3    5
## 10 224   4    2
## 11 224   5    8
## 12 226   5    1
## 13 233   3    9
## 14 241   1    1
## 15 242   2    2
## 16 243   4    4
## 17 244   3    1
## 18 245   4    3
## 19 247   3    4
## 20 252   3    2
## 21 254   4    5
## 22 255   4    1
## 23 256   5    1
## 24 264   5   12
## 25 265   1    2
## 26 270   2    4
## 27 270   4    3
## 28 273   1    5
## 29 276   2    5
## 30 276   5    4
## 31 277   4   11
## 32 279   5    4
## 33 280   4    1
## 34 281   4    4
## 35 284   1    1
## 36 284   8    6
## 37 285   1    1
## 38 287   5    8
## 39 289   4    5
## 40 293   6    3
## 41 296   1    1
## 42 296   5    7
## 43 300   1    1
## 44 300   6    6
## 45 304   1    2
## 46 304   4   12
## 47 305   4    4
## 48 307   4   22
## 49 307   8   18
## 50 311   5    7
## 51 313   2    1
## 52 315   1    2
## 53 329   2    3
## 54 329   7    3
## 55 330   7   10
## 56 334   4    2
## 57 335   1    2
## 58 337   4    2
## 59 345   4    3
## 60 348   2    2
## 61 351   4    1
## 62 352   3    2
## 63 358   5    3
## 64 370   5    2
## 65 384   5   11
## 66 391   6    8
## 67 398   5   12
## 68 402   3    2
## 69 403   5   30
## 70 411   4    2
## 71 422   1    1
## 72 430   4    3
## 73 432   6    9
## 74 437   4   15
## 75 469   3    1
## 76 666  24  132
## 77 711   4    5
by.tax    <- summarise(by.rad, sum=sum(size))
arrange(by.tax,tax)
## Source: local data frame [66 x 2]
## 
##    tax sum
## 1  187   1
## 2  188   7
## 3  193   8
## 4  198   1
## 5  216   5
## 6  222   7
## 7  223   5
## 8  224  10
## 9  226   1
## 10 233   9
## 11 241   1
## 12 242   2
## 13 243   4
## 14 244   1
## 15 245   3
## 16 247   4
## 17 252   2
## 18 254   5
## 19 255   1
## 20 256   1
## 21 264  12
## 22 265   2
## 23 270   7
## 24 273   5
## 25 276   9
## 26 277  11
## 27 279   4
## 28 280   1
## 29 281   4
## 30 284   7
## 31 285   1
## 32 287   8
## 33 289   5
## 34 293   3
## 35 296   8
## 36 300   7
## 37 304  14
## 38 305   4
## 39 307  40
## 40 311   7
## 41 313   1
## 42 315   2
## 43 329   6
## 44 330  10
## 45 334   2
## 46 335   2
## 47 337   2
## 48 345   3
## 49 348   2
## 50 351   1
## 51 352   2
## 52 358   3
## 53 370   2
## 54 384  11
## 55 391   8
## 56 398  12
## 57 402   2
## 58 403  30
## 59 411   2
## 60 422   1
## 61 430   3
## 62 432   9
## 63 437  15
## 64 469   1
## 65 666 132
## 66 711   5

Operator %>%

It’s the ability to chain operations together from left to right with the %>% operator. Expression x %>% f(y) turns into f(x, y) so this makes dplyr behave a little like a grammar of data manipulation:

my.data %>% group_by(tax)
## Source: local data frame [506 x 14]
## Groups: tax
## 
##       crim   zn indus chas   nox    rm   age   dis rad tax ptratio black
## 1  0.00632 18.0  2.31    0 0.538 6.575  65.2 4.090   1 296    15.3 396.9
## 2  0.02731  0.0  7.07    0 0.469 6.421  78.9 4.967   2 242    17.8 396.9
## 3  0.02729  0.0  7.07    0 0.469 7.185  61.1 4.967   2 242    17.8 392.8
## 4  0.03237  0.0  2.18    0 0.458 6.998  45.8 6.062   3 222    18.7 394.6
## 5  0.06905  0.0  2.18    0 0.458 7.147  54.2 6.062   3 222    18.7 396.9
## 6  0.02985  0.0  2.18    0 0.458 6.430  58.7 6.062   3 222    18.7 394.1
## 7  0.08829 12.5  7.87    0 0.524 6.012  66.6 5.561   5 311    15.2 395.6
## 8  0.14455 12.5  7.87    0 0.524 6.172  96.1 5.950   5 311    15.2 396.9
## 9  0.21124 12.5  7.87    0 0.524 5.631 100.0 6.082   5 311    15.2 386.6
## 10 0.17004 12.5  7.87    0 0.524 6.004  85.9 6.592   5 311    15.2 386.7
## ..     ...  ...   ...  ...   ...   ...   ...   ... ... ...     ...   ...
## Variables not shown: lstat (dbl), medv (dbl)
my.data %>% group_by(tax) %>% summarise(total = sum(dis))
## Source: local data frame [66 x 2]
## 
##    tax   total
## 1  187  12.127
## 2  188  14.243
## 3  193  23.182
## 4  198   5.885
## 5  216  28.475
## 6  222  31.861
## 7  223  21.059
## 8  224  54.421
## 9  226   8.697
## 10 233  50.731
## 11 241   9.089
## 12 242   9.934
## 13 243  27.259
## 14 244   6.336
## 15 245  15.350
## 16 247  17.392
## 17 252  10.802
## 18 254  22.135
## 19 255   5.648
## 20 256   8.325
## 21 264  25.669
## 22 265  12.439
## 23 270  23.967
## 24 273  11.828
## 25 276  30.262
## 26 277  43.442
## 27 279  14.520
## 28 280   8.344
## 29 281  21.603
## 30 284  51.035
## 31 285   7.307
## 32 287  39.492
## 33 289  29.246
## 34 293  21.952
## 35 296  24.954
## 36 300  46.430
## 37 304  54.070
## 38 305  21.149
## 39 307 154.386
## 40 311  42.209
## 41 313   9.188
## 42 315  18.441
## 43 329  38.402
## 44 330  80.281
## 45 334  21.171
## 46 335  17.584
## 47 337  13.223
## 48 345  19.494
## 49 348  12.540
## 50 351   8.535
## 51 352  16.027
## 52 358  23.483
## 53 370  12.197
## 54 384  27.680
## 55 391  20.561
## 56 398  58.775
## 57 402  15.307
## 58 403  54.932
## 59 411  21.421
## 60 422   6.267
## 61 430  16.303
## 62 432  23.013
## 63 437  29.430
## 64 469   7.320
## 65 666 272.086
## 66 711   9.377
my.data %>% group_by(tax) %>% summarise(total = sum(dis)) %>% arrange(desc(total))
## Source: local data frame [66 x 2]
## 
##    tax   total
## 1  666 272.086
## 2  307 154.386
## 3  330  80.281
## 4  398  58.775
## 5  403  54.932
## 6  224  54.421
## 7  304  54.070
## 8  284  51.035
## 9  233  50.731
## 10 300  46.430
## 11 277  43.442
## 12 311  42.209
## 13 287  39.492
## 14 329  38.402
## 15 222  31.861
## 16 276  30.262
## 17 437  29.430
## 18 289  29.246
## 19 216  28.475
## 20 384  27.680
## 21 243  27.259
## 22 264  25.669
## 23 296  24.954
## 24 270  23.967
## 25 358  23.483
## 26 193  23.182
## 27 432  23.013
## 28 254  22.135
## 29 293  21.952
## 30 281  21.603
## 31 411  21.421
## 32 334  21.171
## 33 305  21.149
## 34 223  21.059
## 35 391  20.561
## 36 345  19.494
## 37 315  18.441
## 38 335  17.584
## 39 247  17.392
## 40 430  16.303
## 41 352  16.027
## 42 245  15.350
## 43 402  15.307
## 44 279  14.520
## 45 188  14.243
## 46 337  13.223
## 47 348  12.540
## 48 265  12.439
## 49 370  12.197
## 50 187  12.127
## 51 273  11.828
## 52 252  10.802
## 53 242   9.934
## 54 711   9.377
## 55 313   9.188
## 56 241   9.089
## 57 226   8.697
## 58 351   8.535
## 59 280   8.344
## 60 256   8.325
## 61 469   7.320
## 62 285   7.307
## 63 244   6.336
## 64 422   6.267
## 65 198   5.885
## 66 255   5.648
my.data %>% group_by(tax) %>% summarise(total = sum(dis)) %>% arrange(desc(total)) %>% filter(total < 50) %>% head(7)
## Source: local data frame [7 x 2]
## 
##   tax total
## 1 300 46.43
## 2 277 43.44
## 3 311 42.21
## 4 287 39.49
## 5 329 38.40
## 6 222 31.86
## 7 276 30.26
df <- data.frame(Date=c("2014-01-01" ,"2014-01-01" ,"2014-01-01" ,"2014-01-01" ,"2014-01-02" ,"2014-01-02" ,"2014-01-02" ,"2014-01-03" ,"2014-01-03" ,"2014-01-03"),
                 Col1=c(123,123,124,125,123,126,127,521,123,126))
df
##          Date Col1
## 1  2014-01-01  123
## 2  2014-01-01  123
## 3  2014-01-01  124
## 4  2014-01-01  125
## 5  2014-01-02  123
## 6  2014-01-02  126
## 7  2014-01-02  127
## 8  2014-01-03  521
## 9  2014-01-03  123
## 10 2014-01-03  126
# now: count unique values in Col1 for the each date (that did not repeat in previous date), and add to the previous count

df %>% 
  arrange(Date) %>% 
  filter(!duplicated(Col1)) %>% 
  group_by(Date) %>% 
  summarise(Count=length(Date)) %>% 
  mutate(Count = cumsum(Count))
## Source: local data frame [3 x 2]
## 
##         Date Count
## 1 2014-01-01     3
## 2 2014-01-02     5
## 3 2014-01-03     6

To work with databases cf http://cran.rstudio.com/web/packages/dplyr/vignettes/databases.html