[wikipedia] _In computer science, relational algebra is an offshoot of first-order logic and of algebra of sets concerned with operations over finitary relations, usually made more convenient to work with by identifying the components of a tuple by a name (called attribute) rather than by a numeric column index, which is what is called a relation in database terminology.

The main application of relational algebra is providing a theoretical foundation for relational databases, particularly query languages for such databases, chiefly among which is SQL._

Package sqldf

This package performs SQL queries on R data frames

library(sqldf)
## Warning: package 'sqldf' was built under R version 3.1.1
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.1.1
## Loading required package: proto
## Warning: package 'proto' was built under R version 3.1.1
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.1.1
## Loading required package: DBI
## Warning: package 'DBI' was built under R version 3.1.1
## Loading required package: RSQLite.extfuns
## Warning: package 'RSQLite.extfuns' was built under R version 3.1.1
set.seed(101)
size = 50
get.name <- function(size)paste(sample(letters,size),collapse="")
mydf <- data.frame(id=1:size,
                   name=as.vector(Map(get.name, rep(5,size)), mode="character"),
                   time=rpois(size,100),
                   size=floor(runif(size,1,6)))
head(mydf)
##   id  name time size
## 1  1 jbrpf   92    5
## 2  2 hoiym  109    3
## 3  3 wryvk  103    4
## 4  4 pufja  107    5
## 5  5 sxfpu   92    1
## 6  6 ubjxo  102    3
sqldf("SELECT * 
       FROM mydf 
       LIMIT 6")
## Loading required package: tcltk
##   id  name time size
## 1  1 jbrpf   92    5
## 2  2 hoiym  109    3
## 3  3 wryvk  103    4
## 4  4 pufja  107    5
## 5  5 sxfpu   92    1
## 6  6 ubjxo  102    3
sqldf("SELECT * 
       FROM mydf 
       WHERE name IN ('megku','qdaym')")
## [1] id   name time size
## <0 rows> (or 0-length row.names)
sqldf("SELECT * 
       FROM mydf 
       WHERE name LIKE '%a%'")  # names with at least one 'a'
##    id  name time size
## 1   4 pufja  107    5
## 2   8 xftau   83    4
## 3  14 sahuw  119    1
## 4  17 rtswa  112    2
## 5  18 yikda   99    2
## 6  21 dajtp  102    4
## 7  22 icarv   92    2
## 8  23 danut  103    1
## 9  30 elpaj  113    4
## 10 34 zhvan   99    5
## 11 38 afcoe  109    3
## 12 39 fahgu  103    1
## 13 43 kumla  112    2
## 14 44 qimva   90    3
## 15 49 gvfea   91    3
sqldf("SELECT id, MAX(time) 
       FROM mydf")
##   id MAX(time)
## 1 26       121
sqldf("SELECT * 
       FROM mydf 
       WHERE time BETWEEN 90 AND 93")
##   id  name time size
## 1  1 jbrpf   92    5
## 2  5 sxfpu   92    1
## 3 22 icarv   92    2
## 4 24 zvmhn   91    3
## 5 33 zrumg   92    5
## 6 44 qimva   90    3
## 7 45 hmlkj   92    2
## 8 49 gvfea   91    3
sqldf("SELECT * 
       FROM mydf 
       WHERE name BETWEEN 'a' AND 'g'")
##    id  name time size
## 1  11 ckqxj   96    4
## 2  21 dajtp  102    4
## 3  23 danut  103    1
## 4  30 elpaj  113    4
## 5  31 dlovc  105    5
## 6  32 fnslt  107    2
## 7  38 afcoe  109    3
## 8  39 fahgu  103    1
## 9  42 cvfrk   99    3
## 10 46 exdrw  101    5
## 11 47 fhgme  105    5
sqldf("SELECT time, count(*) AS n_refs
       FROM mydf 
       GROUP BY time HAVING count(*) > 2")
##   time n_refs
## 1   92      5
## 2   96      3
## 3   99      4
## 4  103      4
## 5  105      3
## 6  112      3
sqldf("SELECT * 
       FROM mydf 
       WHERE time = 103")
##   id  name time size
## 1  3 wryvk  103    4
## 2 10 gbvrc  103    5
## 3 23 danut  103    1
## 4 39 fahgu  103    1
sqldf("SELECT id, time
       FROM mydf 
       WHERE time >= 70 AND time <= 86 
       ORDER BY time;")
##   id time
## 1 35   81
## 2  8   83
## 3 37   84
sqldf("SELECT size, avg(time) as avg_time 
       FROM mydf 
       GROUP BY size;")
##   size avg_time
## 1    1   103.56
## 2    2    99.12
## 3    3    99.30
## 4    4   102.88
## 5    5   101.20
size2 <- 200
mydf2 <- data.frame(id=sample(1:50,size2,replace=TRUE),
                    value=rexp(size2,1/50))
head(mydf2)
##   id   value
## 1 23 34.3517
## 2 26 19.9782
## 3 42  4.1858
## 4 46  0.2366
## 5 38 36.0750
## 6 39 41.6617
sqldf("SELECT count(*) AS n_values
       FROM mydf2 
       GROUP BY id
       LIMIT 12")
##    n_values
## 1         3
## 2         2
## 3         4
## 4         5
## 5         2
## 6         4
## 7         6
## 8         5
## 9         3
## 10        2
## 11        1
## 12        5
sqldf("SELECT id, SUM(value) AS sum_vals
       FROM mydf2 
       GROUP BY id
       HAVING SUM(value) > 300")
##    id sum_vals
## 1   4    321.9
## 2  12    340.2
## 3  18    440.5
## 4  23    395.6
## 5  30    641.2
## 6  31    306.4
## 7  32    311.7
## 8  36    395.9
## 9  40    338.4
## 10 45    542.7
# Let the joins begin!
# INNER JOIN, ie, returns rows when there is at least one match in both tables
sqldf("SELECT mydf.name, mydf2.value 
       FROM mydf JOIN mydf2 
       ON mydf.id = mydf2.id 
       ORDER BY mydf.name 
       LIMIT 12")
##     name   value
## 1  afcoe  36.075
## 2  afcoe  48.312
## 3  afcoe  48.471
## 4  afcoe 136.284
## 5  ckqxj  27.994
## 6  cvfrk   4.186
## 7  cvfrk  42.393
## 8  dajtp  14.256
## 9  dajtp  26.939
## 10 dajtp  38.055
## 11 dajtp  42.254
## 12 dajtp  48.238
# LEFT (OUTER) JOIN keyword returns all rows from the left table (table_name1), 
# even if there are no matches in the right table (table_name2).
sqldf("SELECT mydf.name, mydf2.value 
       FROM mydf LEFT JOIN mydf2 
       ON mydf.id = mydf2.id 
       ORDER BY mydf.name 
       LIMIT 12")  # in this case there is no difference, all id's have values in mydf2
##     name   value
## 1  afcoe  36.075
## 2  afcoe  48.312
## 3  afcoe  48.471
## 4  afcoe 136.284
## 5  ckqxj  27.994
## 6  cvfrk   4.186
## 7  cvfrk  42.393
## 8  dajtp  14.256
## 9  dajtp  26.939
## 10 dajtp  38.055
## 11 dajtp  42.254
## 12 dajtp  48.238
mydf3 <- data.frame(id=sample(51:60,size2,replace=TRUE),
                    value=rexp(size2,1/50))

# UNION combines the result-set of two or more SELECT statements
# It does not include repetitions (for that, use UNION ALL)
sqldf("SELECT mydf2.id FROM mydf2 
       UNION
       SELECT mydf3.id FROM mydf3
       ORDER BY id DESC
       LIMIT 12") 
##    mydf2.id
## 1        60
## 2        59
## 3        58
## 4        57
## 5        56
## 6        55
## 7        54
## 8        53
## 9        52
## 10       51
## 11       50
## 12       49

Reading CSV files

df <- read.csv("effort.csv")
head(df, 10)
##               X setting effort change
## 1       Bolivia      46      0      1
## 2        Brazil      74      0     10
## 3         Chile      89     16     29
## 4      Colombia      77     16     25
## 5     CostaRica      84     21     29
## 6          Cuba      89     15     40
## 7  DominicanRep      68     14     21
## 8       Ecuador      70      6      0
## 9    ElSalvador      60     13     13
## 10    Guatemala      55      9      4
df <- read.csv.sql("effort.csv", "SELECT * FROM file WHERE effort>10")
head(df, 10)
##                  X setting effort change
## 1          "Chile"      89     16     29
## 2       "Colombia"      77     16     25
## 3      "CostaRica"      84     21     29
## 4           "Cuba"      89     15     40
## 5   "DominicanRep"      68     14     21
## 6     "ElSalvador"      60     13     13
## 7        "Jamaica"      87     23     21
## 8         "Panama"      84     19     22
## 9 "TrinidadTobago"      84     15     29
df <- read.csv.sql("effort.csv", "SELECT X, effort FROM file")
head(df, 10)
##                 X effort
## 1       "Bolivia"      0
## 2        "Brazil"      0
## 3         "Chile"     16
## 4      "Colombia"     16
## 5     "CostaRica"     21
## 6          "Cuba"     15
## 7  "DominicanRep"     14
## 8       "Ecuador"      6
## 9    "ElSalvador"     13
## 10    "Guatemala"      9
df <- read.csv.sql("effort.csv", "SELECT X, effort FROM file ORDER BY X DESC")
head(df, 10)
##                   X effort
## 1       "Venezuela"      7
## 2  "TrinidadTobago"     15
## 3            "Peru"      0
## 4        "Paraguay"      3
## 5          "Panama"     19
## 6       "Nicaragua"      0
## 7          "Mexico"      4
## 8         "Jamaica"     23
## 9        "Honduras"      7
## 10          "Haiti"      3
df <- read.csv.sql("effort.csv", "SELECT effort, COUNT(*) AS count FROM file GROUP BY effort")
head(df, 10)
##    effort count
## 1       0     4
## 2       3     2
## 3       4     1
## 4       6     1
## 5       7     2
## 6       9     1
## 7      13     1
## 8      14     1
## 9      15     2
## 10     16     2

Package relations

Data Structures and Algorithms for for k-ary relations with arbitrary domains, featuring relational algebra, predicate functions, and fitters for consensus relations

Check http://cran.r-project.org/web/packages/relations/index.html

library(relations)
## Warning: package 'relations' was built under R version 3.1.1
PersonDF <-
  data.frame(Name = c("Harry", "Sally", "George", "Helena", "Peter"),
             Age = c(34, 28, 29, 54, 34),
             Weight = c(80, 64, 70, 54, 80),
             stringsAsFactors = FALSE)
Person <- as.relation(PersonDF)
## see relation
relation_table(Person)
##  Name   Age Weight
##  Helena 54  54    
##  Sally  28  64    
##  George 29  70    
##  Harry  34  80    
##  Peter  34  80
## projection
relation_table(relation_projection(Person, c("Age", "Weight")))
##  Age Weight
##  54  54    
##  28  64    
##  29  70    
##  34  80
## selection
relation_table(R1 <- relation_selection(Person, Age < 29))
##  Name  Age Weight
##  Sally 28  64
relation_table(R2 <- relation_selection(Person, Age >= 34))
##  Name   Age Weight
##  Helena 54  54    
##  Harry  34  80    
##  Peter  34  80
relation_table(R3 <- relation_selection(Person, Age == Weight))
##  Name   Age Weight
##  Helena 54  54
## union
relation_table(R1 %U% R2)
##  Name   Age Weight
##  Helena 54  54    
##  Sally  28  64    
##  Harry  34  80    
##  Peter  34  80
## works only for the same domains:
relation_table(R2 | R3)
##  Name   Age Weight
##  Helena 54  54    
##  Harry  34  80    
##  Peter  34  80
## complement
relation_table(Person - R2)
##  Name   Age Weight
##  Sally  28  64    
##  George 29  70
## intersection
relation_table(relation_intersection(R2, R3))
##  Name   Age Weight
##  Helena 54  54
## works only for the same domains:
relation_table(R2 & R3)
##  Name   Age Weight
##  Helena 54  54
## symmetric difference
relation_table(relation_symdiff(R2, R3))
##  Name  Age Weight
##  Harry 34  80    
##  Peter 34  80
## cartesian product
Employee <-
  data.frame(Name = c("Harry", "Sally", "George", "Harriet", "John"),
             EmpId = c(3415, 2241, 3401, 2202, 3999),
             DeptName = c("Finance", "Sales", "Finance", "Sales", "N.N."),
             stringsAsFactors = FALSE)

Employee <- as.relation(Employee)
relation_table(Employee)
##  Name    EmpId DeptName
##  George  3401  Finance 
##  Harry   3415  Finance 
##  John    3999  N.N.    
##  Harriet 2202  Sales   
##  Sally   2241  Sales
Dept <- data.frame(DeptName = c("Finance", "Sales", "Production"),
                   Manager = c("George", "Harriet", "Charles"),
                   stringsAsFactors = FALSE)
Dept <- as.relation(Dept)
relation_table(Dept)
##  DeptName   Manager
##  Production Charles
##  Finance    George 
##  Sales      Harriet
relation_table(Employee %><% Dept)
##  Name    EmpId DeptName DeptName   Manager
##  George  3401  Finance  Production Charles
##  Harry   3415  Finance  Production Charles
##  John    3999  N.N.     Production Charles
##  Harriet 2202  Sales    Production Charles
##  Sally   2241  Sales    Production Charles
##  George  3401  Finance  Finance    George 
##  Harry   3415  Finance  Finance    George 
##  John    3999  N.N.     Finance    George 
##  Harriet 2202  Sales    Finance    George 
##  Sally   2241  Sales    Finance    George 
##  George  3401  Finance  Sales      Harriet
##  Harry   3415  Finance  Sales      Harriet
##  John    3999  N.N.     Sales      Harriet
##  Harriet 2202  Sales    Sales      Harriet
##  Sally   2241  Sales    Sales      Harriet
## Natural join
relation_table(Employee %|><|% Dept)
##  Name    EmpId DeptName Manager
##  George  3401  Finance  George 
##  Harry   3415  Finance  George 
##  Harriet 2202  Sales    Harriet
##  Sally   2241  Sales    Harriet
## left (outer) join
relation_table(Employee %=><% Dept)
##  Name    EmpId DeptName Manager
##  George  3401  Finance  George 
##  Harry   3415  Finance  George 
##  Harriet 2202  Sales    Harriet
##  Sally   2241  Sales    Harriet
##  John    3999  N.N.     NA
## right (outer) join
relation_table(Employee %><=% Dept)
##  Name    EmpId DeptName   Manager
##  NA        NA  Production Charles
##  George  3401  Finance    George 
##  Harry   3415  Finance    George 
##  Harriet 2202  Sales      Harriet
##  Sally   2241  Sales      Harriet
## full outer join
relation_table(Employee %=><=% Dept)
##  Name    EmpId DeptName   Manager
##  NA        NA  Production Charles
##  George  3401  Finance    George 
##  Harry   3415  Finance    George 
##  Harriet 2202  Sales      Harriet
##  Sally   2241  Sales      Harriet
##  John    3999  N.N.       NA
## antijoin
relation_table(Employee %|>% Dept)
##  Name EmpId DeptName
##  John 3999  N.N.
relation_table(Employee %<|% Dept)
##  DeptName   Manager
##  Production Charles
## semijoin
relation_table(Employee %|><% Dept)
##  Name    EmpId DeptName
##  George  3401  Finance 
##  Harry   3415  Finance 
##  Harriet 2202  Sales   
##  Sally   2241  Sales
relation_table(Employee %><|% Dept)
##  DeptName Manager
##  Finance  George 
##  Sales    Harriet
## division
Completed <-
  data.frame(Student = c("Fred", "Fred", "Fred", "Eugene",
                         "Eugene", "Sara", "Sara"),
             Task = c("Database1", "Database2", "Compiler1",
                      "Database1", "Compiler1", "Database1",
                      "Database2"),
             stringsAsFactors = FALSE)
Completed <- as.relation(Completed)
relation_table(Completed)
##  Student Task     
##  Eugene  Compiler1
##  Fred    Compiler1
##  Eugene  Database1
##  Fred    Database1
##  Sara    Database1
##  Fred    Database2
##  Sara    Database2
DBProject <- data.frame(Task = c("Database1", "Database2"),
                        stringsAsFactors = FALSE)
DBProject <- as.relation(DBProject)
relation_table(DBProject)
##  Task     
##  Database1
##  Database2
relation_table(Completed %/% DBProject)
##  Student
##  Fred   
##  Sara
## division remainder
relation_table(Completed %% DBProject)
##  Student Task     
##  Eugene  Compiler1
##  Fred    Compiler1
##  Eugene  Database1