SQL Tutorial

Preliminaries
SQL Language
About postgreSQL

Preliminaries

Herein we are using the open-source database postgreSQL (more details below).

All databases and commands to include them in your computer’s postgreSQL are here.

How to access a postgreSQL database using R:

library(RPostgreSQL)
 
drv <- dbDriver("PostgreSQL")
db <- dbConnect(drv, dbname="dvdrental", user="postgres", password="admin")

And to execute a query over the chosen database:

query <- "SELECT
            *
          FROM
            customer;"
 
data <- dbGetQuery(db, query)
head(data)

##   customer_id store_id first_name last_name
## 1         524        1      Jared       Ely
## 2           1        1       Mary     Smith
## 3           2        1   Patricia   Johnson
## 4           3        1      Linda  Williams
## 5           4        2    Barbara     Jones
## 6           5        1  Elizabeth     Brown
##                                 email address_id activebool create_date
## 1        jared.ely@sakilacustomer.org        530       TRUE  2006-02-14
## 2       mary.smith@sakilacustomer.org          5       TRUE  2006-02-14
## 3 patricia.johnson@sakilacustomer.org          6       TRUE  2006-02-14
## 4   linda.williams@sakilacustomer.org          7       TRUE  2006-02-14
## 5    barbara.jones@sakilacustomer.org          8       TRUE  2006-02-14
## 6  elizabeth.brown@sakilacustomer.org          9       TRUE  2006-02-14
##           last_update active
## 1 2013-05-26 14:49:45      1
## 2 2013-05-26 14:49:45      1
## 3 2013-05-26 14:49:45      1
## 4 2013-05-26 14:49:45      1
## 5 2013-05-26 14:49:45      1
## 6 2013-05-26 14:49:45      1

SQL Language

So SQL is a declarative language that specify relational algebra operations over tables (where rows represent records, and columns record attributes) in a relational database. Check wikipedia for more info.

This exercises are from http://sqlschool.modeanalytics.com/

Selection

A query is the SQL way to retrieve information from one or more tables.

This is an eg of projection, ie, selection of just some columns:

query <- "SELECT
            first_name, email
          FROM
            customer;"
 
data <- dbGetQuery(db, query)
head(data)

##   first_name                               email
## 1      Jared        jared.ely@sakilacustomer.org
## 2       Mary       mary.smith@sakilacustomer.org
## 3   Patricia patricia.johnson@sakilacustomer.org
## 4      Linda   linda.williams@sakilacustomer.org
## 5    Barbara    barbara.jones@sakilacustomer.org
## 6  Elizabeth  elizabeth.brown@sakilacustomer.org

Rename the resulting columns:

query <- 'SELECT
            first_name AS "FIRST NAME", 
            email AS "EMAIL"
          FROM
            customer;'
 
data <- dbGetQuery(db, query)
head(data)

##   FIRST NAME                               EMAIL
## 1      Jared        jared.ely@sakilacustomer.org
## 2       Mary       mary.smith@sakilacustomer.org
## 3   Patricia patricia.johnson@sakilacustomer.org
## 4      Linda   linda.williams@sakilacustomer.org
## 5    Barbara    barbara.jones@sakilacustomer.org
## 6  Elizabeth  elizabeth.brown@sakilacustomer.org

Fix a limit on the queried data:

query <- "SELECT
            first_name, email
          FROM
            customer
          LIMIT 3;"
 
data <- dbGetQuery(db, query)
head(data)

##   first_name                               email
## 1      Jared        jared.ely@sakilacustomer.org
## 2       Mary       mary.smith@sakilacustomer.org
## 3   Patricia patricia.johnson@sakilacustomer.org

Another relational algebra operation is a filter:

query <- "SELECT
            first_name, address_id, store_id
          FROM
            customer
          WHERE
            (address_id > 15 AND first_name <= 'm') OR store_id != 1;"
 
data <- dbGetQuery(db, query)
head(data,10)

##    first_name address_id store_id
## 1       Jared        530        1
## 2     Barbara          8        2
## 3    Jennifer         10        2
## 4       Susan         12        2
## 5    Margaret         13        2
## 6        Lisa         15        2
## 7       Karen         17        2
## 8       Betty         18        2
## 9       Helen         19        1
## 10     Sandra         20        2

The logical operators are AND, OR and NOT

To mix two columns:

query <- "SELECT
            customer_id, 
            store_id, 
            1000 * customer_id + store_id AS mix_id
          FROM
            customer;"
 
data <- dbGetQuery(db, query)
head(data)

##   customer_id store_id mix_id
## 1         524        1 524001
## 2           1        1   1001
## 3           2        1   2001
## 4           3        1   3001
## 5           4        2   4002
## 6           5        1   5001

To make sorts:

query <- "SELECT
            first_name, 
            last_name
          FROM
            customer
          ORDER BY
            first_name DESC, last_name;" # first name by decreasing order, then last name by increasing order
 
data <- dbGetQuery(db, query)
head(data,7)

##   first_name   last_name
## 1    Zachary        Hite
## 2     Yvonne     Watkins
## 3    Yolanda      Weaver
## 4      Wilma    Richards
## 5     Willie      Howell
## 6     Willie     Markham
## 7    William Satterfield

Using (simple) regular expressions:

query <- "SELECT
            first_name, 
            last_name
          FROM
            customer
          WHERE
            first_name LIKE 'W%a' OR last_name LIKE 'J_n%';"   # % means 0+ chars, _ means 1 char
 
data <- dbGetQuery(db, query)
head(data)

##   first_name last_name
## 1    Barbara     Jones
## 2     Louise   Jenkins
## 3      Wanda Patterson
## 4      Wilma  Richards
## 5       Lena    Jensen
## 6      Naomi  Jennings

ILIKE is for case insensitive.

IN is used to compare with a set of values:

query <- "SELECT
            first_name,    -- btw, this is a SQL comment
            last_name
          FROM
            customer
          /* 
                this is a multiline comment
           */
          WHERE
            first_name IN ('Mary', 'Barbara');" 
 
data <- dbGetQuery(db, query)
head(data)

##   first_name last_name
## 1       Mary     Smith
## 2    Barbara     Jones

The next egs use this google stock csv. To include it in postgreSQL do:

CREATE DATABASE stocks;
\c stocks;
CREATE TABLE stocks ( DATE_STOCK varchar(8), YEAR integer, MONTH integer, OPEN double precision, HIGH double precision, LOW double precision, CLOSE double precision, VOLUME integer, id integer PRIMARY KEY );
\COPY stocks FROM 'C:\Users\jpn\Downloads\stocks.csv' DELIMITER ',' CSV HEADER; (adjust to your local dir)

Now we can access it here:

db2 <- dbConnect(drv, dbname="stocks", user="postgres", password="admin")

query <- "SELECT
            *
          FROM
            stocks;" 
 
data <- dbGetQuery(db2, query)
head(data,10)

##    date_stock year month   open   high    low  close   volume id
## 1     1/30/14 2014     1 502.54 506.50 496.70 499.78 24182996  1
## 2     1/29/14 2014     1 503.95 507.37 498.62 500.75 17991828  2
## 3     1/28/14 2014     1 508.76 515.00 502.07 506.50 38119083  3
## 4     1/27/14 2014     1 550.07 554.80 545.75 550.50 20602736  4
## 5     1/24/14 2014     1 554.00 555.62 544.75 546.07 15483491  5
## 6     1/23/14 2014     1 549.94 556.50 544.81 556.18 14425478  6
## 7     1/22/14 2014     1 550.91 557.29 547.81 551.51 13602762  7
## 8     1/21/14 2014     1 540.99 550.07 540.42 549.07 11750792  8
## 9     1/17/14 2014     1 551.48 552.07 539.90 540.67 15489527  9
## 10    1/16/14 2014     1 554.90 556.85 551.68 554.25  8210190 10

We can aggregate information into some available functions:

query <- "SELECT
            COUNT(*),                 -- count the number of records
            COUNT(high) AS highs,     -- count the number of not null values of column 'high'
            SUM(high)   AS sum_highs, -- null values are treated as zero
            MIN(volume) AS min_volume,
            MAX(volume) AS max_volume,
            AVG(low)    AS avg_low    -- ignore nulls
          FROM
            stocks;" 
 
data <- dbGetQuery(db2, query)
head(data,10)

##   count highs sum_highs min_volume max_volume avg_low
## 1   100    99  52176.27    5984105   38119083 518.493

We can mix the aggregators with grouping the table into values:

query <- "SELECT
            year,
            COUNT(*),                 -- count the number of records
            COUNT(high) AS highs,     -- count the number of not null values of column 'high'
            SUM(high)   AS sum_highs, -- null values are treated as zero
            MIN(volume) AS min_volume,
            MAX(volume) AS max_volume,
            AVG(low)    AS avg_low    -- ignore nulls
          FROM
            stocks
          GROUP BY
            year;" 
 
data <- dbGetQuery(db2, query)
head(data,10)

##   year count highs sum_highs min_volume max_volume  avg_low
## 1 2013    80    80  41824.02    5984105   31932384 514.3559
## 2 2014    20    19  10352.25    8210190   38119083 535.0415

Groups can be nested:

query <- "SELECT
            year, month,
            COUNT(*),                 -- count the number of records
            COUNT(high) AS highs,     -- count the number of not null values of column 'high'
            SUM(high)   AS sum_highs, -- null values are treated as zero
            MIN(volume) AS min_volume,
            MAX(volume) AS max_volume,
            AVG(low)    AS avg_low    -- ignore nulls
          FROM
            stocks
          GROUP BY
            year, month
          ORDER BY
            year, month;" 
 
data <- dbGetQuery(db2, query)
head(data,10)

##   year month count highs sum_highs min_volume max_volume  avg_low
## 1 2013     9    16    16   7715.25    8144247   31932384 473.1256
## 2 2013    10    23    23  11700.72    8967859   22707445 499.5426
## 3 2013    11    20    20  10556.59    6935114   14335104 520.2230
## 4 2013    12    21    21  11851.46    5984105   20209401 556.4057
## 5 2014     1    20    19  10352.25    8210190   38119083 535.0415

Distinct values can be selected:

query <- "SELECT DISTINCT
            month
          FROM
            stocks;" 
 
data <- dbGetQuery(db2, query)
head(data,10)

##   month
## 1    11
## 2    12
## 3     1
## 4     9
## 5    10

With more than one column, SQL returns all the distinct tuples it finds:

query <- "SELECT DISTINCT
            month, year
          FROM
            stocks;" 
 
data <- dbGetQuery(db2, query)
head(data)

##   month year
## 1     9 2013
## 2    10 2013
## 3    12 2013
## 4     1 2014
## 5    11 2013

We can use it with count

query <- "SELECT 
            COUNT(DISTINCT month) -- how many distinct months?
          FROM
            stocks;" 
 
data <- dbGetQuery(db2, query)
head(data)

##   count
## 1     5

For the next egs let’s use this csv.

db3 <- dbConnect(drv, dbname="rosters", user="postgres", password="admin")

query <- "SELECT
            *
          FROM
            players;" 
 
data <- dbGetQuery(db3, query)
head(data,10)

##       school_full_name school_name            name position height weight
## 1  Cincinnati Bearcats  Cincinnati Ralph Abernathy       RB     67    161
## 2  Cincinnati Bearcats  Cincinnati    Mekale McKay       WR     78    195
## 3  Cincinnati Bearcats  Cincinnati     Trenier Orr       CB     71    177
## 4  Cincinnati Bearcats  Cincinnati    Bennie Coney       QB     75    216
## 5  Cincinnati Bearcats  Cincinnati   Johnny Holton       WR     75    190
## 6  Cincinnati Bearcats  Cincinnati   Howard Wilder       DB     71    180
## 7  Cincinnati Bearcats  Cincinnati  Munchie Legaux       QB     77    200
## 8  Cincinnati Bearcats  Cincinnati       Mark Barr       WR     73    163
## 9  Cincinnati Bearcats  Cincinnati     Aaron Brown       CB     71    172
## 10 Cincinnati Bearcats  Cincinnati Anthony McClung       WR     73    177
##    year           home_town state id
## 1    JR         ATLANTA, GA    GA  1
## 2    SO      LOUISVILLE, KY    KY  2
## 3    SO   WINTER GARDEN, FL    FL  3
## 4    FR      PLANT CITY, FL    FL  4
## 5    JR           MIAMI, FL    FL  5
## 6    JR      SEA ISLAND, GA    GA  6
## 7    SR     NEW ORLEANS, LA    LA  7
## 8    FR FORT LAUDERDALE, FL    FL  8
## 9    FR           MIAMI, FL    FL  9
## 10   SR    INDIANAPOLIS, IN    IN 10

To deal with conditional statements, SQL uses CASE WHEN ... THEN ... ELSE ... END

query <- "SELECT
            name,
            year,
            CASE WHEN year = 'SR'   -- create a new column where only senior players have a 'yes' values
              THEN 'yes'
              ELSE NULL 
            END AS is_a_senior
          FROM
            players;" 
 
data <- dbGetQuery(db3, query)
head(data,10)

##               name year is_a_senior
## 1  Ralph Abernathy   JR        <NA>
## 2     Mekale McKay   SO        <NA>
## 3      Trenier Orr   SO        <NA>
## 4     Bennie Coney   FR        <NA>
## 5    Johnny Holton   JR        <NA>
## 6    Howard Wilder   JR        <NA>
## 7   Munchie Legaux   SR         yes
## 8        Mark Barr   FR        <NA>
## 9      Aaron Brown   FR        <NA>
## 10 Anthony McClung   SR         yes

It’s possible to have a sequence of conditionals:

query <- "SELECT
            name,
            weight,
            CASE WHEN weight > 250 THEN 'over 250'
                 WHEN weight > 200 THEN '200-250'
                 WHEN weight > 175 THEN '175-200'
                                   ELSE 'under 175' 
            END AS weight_group
          FROM
            players;" 
 
data <- dbGetQuery(db3, query)
head(data,10)

##               name weight weight_group
## 1  Ralph Abernathy    161    under 175
## 2     Mekale McKay    195      175-200
## 3      Trenier Orr    177      175-200
## 4     Bennie Coney    216      200-250
## 5    Johnny Holton    190      175-200
## 6    Howard Wilder    180      175-200
## 7   Munchie Legaux    200      175-200
## 8        Mark Barr    163    under 175
## 9      Aaron Brown    172    under 175
## 10 Anthony McClung    177      175-200

Conditional statements can be mixed with aggregatorsto achieve more expression power:

query <- "SELECT
            CASE WHEN year = 'FR' 
              THEN 'FR'
              ELSE 'Not FR' 
            END      AS year_group,
            COUNT(1) AS count    -- 1 means the first column of the select, aka year_group
          FROM
            players
          GROUP BY 1;" 
 
data <- dbGetQuery(db3, query)
head(data)

##   year_group count
## 1     Not FR  3018
## 2         FR  1982

Another eg:

query <- "SELECT
            CASE WHEN year = 'FR' THEN 'FR'
                 WHEN year = 'SO' THEN 'SO'
                 WHEN year = 'JR' THEN 'JR'
                 WHEN year = 'SR' THEN 'SR'
                                  ELSE 'No Year Data' 
            END      AS year_group,
            COUNT(1) AS count
          FROM
            players
          GROUP BY 1;" 
 
data <- dbGetQuery(db3, query)
head(data)

##   year_group count
## 1         JR  1086
## 2         FR  1982
## 3         SR   846
## 4         SO  1086

The previous eg now rotated, aka, pivoting:

query <- "SELECT
            COUNT(CASE WHEN year = 'JR' THEN 1 ELSE NULL END) AS jr_count,
            COUNT(CASE WHEN year = 'FR' THEN 1 ELSE NULL END) AS fr_count,
            COUNT(CASE WHEN year = 'SR' THEN 1 ELSE NULL END) AS sr_count,
            COUNT(CASE WHEN year = 'SO' THEN 1 ELSE NULL END) AS so_count
          FROM
            players;" 
 
data <- dbGetQuery(db3, query)
head(data)

##   jr_count fr_count sr_count so_count
## 1     1086     1982      846     1086

Joins

Joins permit that two or more tables can be associated to recover common associated information.

In the next egs we use a 2nd table names teams from the same database (csv).

query <- "SELECT
            *
          FROM
            teams;" 
 
data <- dbGetQuery(db3, query)
head(data)

##                   division        conference school_name
## 1 FBS (Division I-A Teams) American Athletic  Cincinnati
## 2 FBS (Division I-A Teams) American Athletic Connecticut
## 3 FBS (Division I-A Teams) American Athletic     Houston
## 4 FBS (Division I-A Teams) American Athletic  Louisville
## 5 FBS (Division I-A Teams) American Athletic     Memphis
## 6 FBS (Division I-A Teams) American Athletic     Rutgers
##                                        roster_url id
## 1 http://espn.go.com/ncf/teams/roster?teamId=2132  1
## 2   http://espn.go.com/ncf/teams/roster?teamId=41  2
## 3  http://espn.go.com/ncf/teams/roster?teamId=248  3
## 4   http://espn.go.com/ncf/teams/roster?teamId=97  4
## 5  http://espn.go.com/ncf/teams/roster?teamId=235  5
## 6  http://espn.go.com/ncf/teams/roster?teamId=164  6

Say we want to know which conference (in teams table) has the highest average weight (in players table):

query <- "SELECT
            *                   -- this is the full table returned by join, not the solution
          FROM
            players JOIN teams  -- this is the new table, the joined table, where select is going to operate
            ON players.school_name = teams.school_name;" 
 
data <- dbGetQuery(db3, query)
head(data)

##      school_full_name school_name            name position height weight
## 1 Cincinnati Bearcats  Cincinnati Ralph Abernathy       RB     67    161
## 2 Cincinnati Bearcats  Cincinnati    Mekale McKay       WR     78    195
## 3 Cincinnati Bearcats  Cincinnati     Trenier Orr       CB     71    177
## 4 Cincinnati Bearcats  Cincinnati    Bennie Coney       QB     75    216
## 5 Cincinnati Bearcats  Cincinnati   Johnny Holton       WR     75    190
## 6 Cincinnati Bearcats  Cincinnati   Howard Wilder       DB     71    180
##   year         home_town state id                 division
## 1   JR       ATLANTA, GA    GA  1 FBS (Division I-A Teams)
## 2   SO    LOUISVILLE, KY    KY  2 FBS (Division I-A Teams)
## 3   SO WINTER GARDEN, FL    FL  3 FBS (Division I-A Teams)
## 4   FR    PLANT CITY, FL    FL  4 FBS (Division I-A Teams)
## 5   JR         MIAMI, FL    FL  5 FBS (Division I-A Teams)
## 6   JR    SEA ISLAND, GA    GA  6 FBS (Division I-A Teams)
##          conference school_name
## 1 American Athletic  Cincinnati
## 2 American Athletic  Cincinnati
## 3 American Athletic  Cincinnati
## 4 American Athletic  Cincinnati
## 5 American Athletic  Cincinnati
## 6 American Athletic  Cincinnati
##                                        roster_url id
## 1 http://espn.go.com/ncf/teams/roster?teamId=2132  1
## 2 http://espn.go.com/ncf/teams/roster?teamId=2132  1
## 3 http://espn.go.com/ncf/teams/roster?teamId=2132  1
## 4 http://espn.go.com/ncf/teams/roster?teamId=2132  1
## 5 http://espn.go.com/ncf/teams/roster?teamId=2132  1
## 6 http://espn.go.com/ncf/teams/roster?teamId=2132  1

query <- "SELECT
            teams.conference    AS conference,
            AVG(players.weight) AS average_weight
          FROM
            players JOIN teams
            ON teams.school_name = players.school_name
          GROUP BY teams.conference
          ORDER BY AVG(players.weight) DESC;" 
 
data <- dbGetQuery(db3, query)
head(data)

##          conference average_weight
## 1               ACC       228.9955
## 2           Big Ten       228.1336
## 3 American Athletic       227.9795
## 4            Big 12       226.4017

This JOIN is an inner join, ie, it will only show rows that satisfy the ON condition. It is an intersection of the two tables (check outer joins below).

If there are columns in different tables with the same names but different information, we should use AS to give them distinct names.

Outer Join

These are are tables for the next egs:

db4 <- dbConnect(drv, dbname="crunchbase", user="postgres", password="admin")

query <- "SELECT
            *                
          FROM
            companies;" 
 
data <- dbGetQuery(db4, query)
head(data)

##                      permalink          name                  homepage
## 1                /company/8868          8868        http://www.8868.cn
## 2                /company/21e6      2.10E+07                      <NA>
## 3        /company/club-domains .Club Domains        http://dotclub.com
## 4        /company/fox-networks .Fox Networks     http://www.dotfox.com
## 5        /company/a-list-games [a]list games http://www.alistgames.com
## 6 /company/pay-mobile-checkout          @Pay          http://atpay.com
##   category_code funding_total_usd    status country_code state_code
## 1          <NA>                NA operating         <NA>       <NA>
## 2          <NA>           5050000 operating          USA         CA
## 3      software           7000000 operating          USA         FL
## 4   advertising           4912394    closed          ARG       <NA>
## 5   games_video           9300000 operating         <NA>       <NA>
## 6        mobile           3500000 operating          USA         NM
##            region          city funding_rounds id
## 1         unknown          <NA>              1  1
## 2          SF Bay San Francisco              1  2
## 3 Fort Lauderdale  Oakland Park              1  3
## 4    Buenos Aires  Buenos Aires              1  4
## 5         unknown          <NA>              1  5
## 6     Albuquerque   Albuquerque              1  6

nrow(data)

## [1] 27325

query <- "SELECT
            *                
          FROM
            acquisitions;" 
 
data <- dbGetQuery(db4, query)
head(data)

##                                      company_permalink
## 1                                     /company/waywire
## 2                         /company/1-nation-technology
## 3 /company/1-stop-financial-service-centers-of-america
## 4                            /company/1-800-contacts-2
## 5                                /company/1000memories
## 6                                      /company/10best
##                                  company_name company_category_code
## 1                                    #waywire                  news
## 2                         1 Nation Technology                  <NA>
## 3 1 Stop Financial Service Centers of America                  <NA>
## 4                              1-800 Contacts                  <NA>
## 5                                1000memories                   web
## 6                                      10best                   web
##   company_country_code company_state_code company_region  company_city
## 1                  USA                 NY       New York      New York
## 2                 <NA>               <NA>        unknown          <NA>
## 3                  USA                 TX         Austin    Round Rock
## 4                 <NA>               <NA>        unknown          <NA>
## 5                  USA                 CA         SF Bay San Francisco
## 6                  USA                 SC     Greenville    Greenville
##               acquirer_permalink          acquirer_name
## 1               /company/magnify                Magnify
## 2                /company/vology                 Vology
## 3        /company/confie-seguros         Confie Seguros
## 4 /company/thomas-h-lee-partners Thomas H. Lee Partners
## 5          /company/ancestry-com               Ancestry
## 6            /company/nile-guide              NileGuide
##   acquirer_category_code acquirer_country_code acquirer_state_code
## 1            games_video                   USA                  NY
## 2                  other                  <NA>                <NA>
## 3             enterprise                   USA                  CA
## 4                   <NA>                   USA                  MA
## 5              ecommerce                   USA                  UT
## 6                 social                   USA                  CA
##   acquirer_region acquirer_city price_amount id
## 1        New York      New York           NA  1
## 2         unknown          <NA>           NA  2
## 3     Los Angeles    Buena Park           NA  3
## 4          Boston        Boston           NA  4
## 5  Salt Lake City         Provo           NA  5
## 6          SF Bay San Francisco           NA  6

nrow(data)

## [1] 5000

There are three types of Outer Join: + Left Outer Join, which includes unmatched rows from the left table + Right Outer Join, which includes unmatched rows from the right table (ie, T2 LEFT JOIN T1 produces the same results as T1 RIGHT JOIN T2) + Full Outer Join, which includes unmatched rows from both tables

First the inner join:

query <- "SELECT
            companies.permalink            AS companies_permalink,
            companies.name                 AS companies_name,
            acquisitions.company_permalink AS acquisitions_permalink
          FROM companies JOIN acquisitions
               ON companies.permalink = acquisitions.company_permalink;" 
 
data <- dbGetQuery(db4, query)
head(data)

##          companies_permalink    companies_name     acquisitions_permalink
## 1           /company/waywire          #waywire           /company/waywire
## 2      /company/1000memories      1000memories      /company/1000memories
## 3         /company/12society         12Society         /company/12society
## 4         /company/280-north         280 North         /company/280-north
## 5         /company/280-north         280 North         /company/280-north
## 6 /company/2web-technologies 2Web Technologies /company/2web-technologies

Now a left join which includes several rows from companies table that does not have an entry at the acquisitions table:

query <- "SELECT
            companies.permalink            AS companies_permalink,
            companies.name                 AS companies_name,
            acquisitions.company_permalink AS acquisitions_permalink
          FROM companies LEFT JOIN acquisitions
               ON companies.permalink = acquisitions.company_permalink;" 
 
data <- dbGetQuery(db4, query)
head(data)

##          companies_permalink    companies_name     acquisitions_permalink
## 1           /company/waywire          #waywire           /company/waywire
## 2      /company/1000memories      1000memories      /company/1000memories
## 3         /company/12society         12Society         /company/12society
## 4         /company/280-north         280 North         /company/280-north
## 5         /company/280-north         280 North         /company/280-north
## 6 /company/2web-technologies 2Web Technologies /company/2web-technologies

The next eg count the number of unique companies (not double-counting companies) and unique acquired companies by state. It does not include results for which there is no state data, and order by the number of acquired companies from highest to lowest.

query <- "SELECT companies.state_code,
                 COUNT(DISTINCT companies.permalink)            AS unique_companies,
                 COUNT(DISTINCT acquisitions.company_permalink) AS unique_companies_acquired
          FROM companies LEFT JOIN acquisitions
              ON companies.permalink = acquisitions.company_permalink
          WHERE companies.state_code IS NOT NULL
          GROUP BY 1
          ORDER BY 3 DESC;" 
 
data <- dbGetQuery(db4, query)
head(data)

##   state_code unique_companies unique_companies_acquired
## 1         CA             6170                       415
## 2         NY             1730                        74
## 3         MA             1272                        68
## 4         WA              638                        41
## 5         TX              808                        40
## 6         CO              453                        19

If you want to provide a default value for the missing attributes, check function COALESCE.

Converting Data

Functions LEFT and RIGHT allow to snip \(n\) chars from a string. LENGTH returns the size of the string.

query <- "SELECT
            company_permalink,
            LEFT(company_permalink, 6)  AS left_permalink,
            RIGHT(company_permalink, 6) AS right_permalink,
            LENGTH(company_permalink) AS length
          FROM
            acquisitions;" 
 
data <- dbGetQuery(db4, query)
head(data)

##                                      company_permalink left_permalink
## 1                                     /company/waywire         /compa
## 2                         /company/1-nation-technology         /compa
## 3 /company/1-stop-financial-service-centers-of-america         /compa
## 4                            /company/1-800-contacts-2         /compa
## 5                                /company/1000memories         /compa
## 6                                      /company/10best         /compa
##   right_permalink length
## 1          aywire     16
## 2          nology     28
## 3          merica     52
## 4          acts-2     25
## 5          mories     21
## 6          10best     15

TRIM allows to remove certain chars from the extremeties of the string. It has 3 args: first from where we should start, then a string of char to remove, then a FROM followed by which column to process:

query <- "SELECT
            company_permalink,
            TRIM(leading '/e' FROM company_permalink) AS trim_begin,
            TRIM(trailing '/e' FROM company_permalink) AS trim_end,
            TRIM(both '/e' FROM company_permalink) AS trim_begin_and_end
          FROM
            acquisitions;" 
 
data <- dbGetQuery(db4, query)
head(data)

##                                      company_permalink
## 1                                     /company/waywire
## 2                         /company/1-nation-technology
## 3 /company/1-stop-financial-service-centers-of-america
## 4                            /company/1-800-contacts-2
## 5                                /company/1000memories
## 6                                      /company/10best
##                                            trim_begin
## 1                                     company/waywire
## 2                         company/1-nation-technology
## 3 company/1-stop-financial-service-centers-of-america
## 4                            company/1-800-contacts-2
## 5                                company/1000memories
## 6                                      company/10best
##                                               trim_end
## 1                                      /company/waywir
## 2                         /company/1-nation-technology
## 3 /company/1-stop-financial-service-centers-of-america
## 4                            /company/1-800-contacts-2
## 5                                /company/1000memories
## 6                                      /company/10best
##                                    trim_begin_and_end
## 1                                      company/waywir
## 2                         company/1-nation-technology
## 3 company/1-stop-financial-service-centers-of-america
## 4                            company/1-800-contacts-2
## 5                                company/1000memories
## 6                                      company/10best

Other functions: + POSITION, STRPOS, to find chars within a string + SUBSTR, LEFT, RIGHT, to produce substrings + CONCAT, ||, to concatenate + UPPER, LOWER, to make uppercase/lowercase string

For date manipulation let’s recheck the second db:

query <- "SELECT
            date_stock, id
          FROM
            stocks;"
 
data <- dbGetQuery(db2, query)
head(data,20)

##    date_stock id
## 1     1/30/14  1
## 2     1/29/14  2
## 3     1/28/14  3
## 4     1/27/14  4
## 5     1/24/14  5
## 6     1/23/14  6
## 7     1/22/14  7
## 8     1/21/14  8
## 9     1/17/14  9
## 10    1/16/14 10
## 11    1/15/14 11
## 12    1/14/14 12
## 13    1/13/14 13
## 14    1/10/14 14
## 15     1/9/14 15
## 16     1/8/14 16
## 17     1/7/14 17
## 18     1/6/14 18
## 19     1/3/14 19
## 20     1/2/14 20

The date_stock column is a string with some quirks in it (different sizes for different dates). How to convert it to date?

query <- "SELECT
            id, 
            date_stock,
            CASE WHEN LENGTH(date_stock) = 6                                  -- months 1 to 9, days 1 to 9
                    THEN ('20' || RIGHT(date_stock, 2) || '-' || 
                                  LEFT(date_stock, 1) || '-' || 
                                  SUBSTR(date_stock, 3, 1))::date 
                 WHEN LENGTH(date_stock) = 8                                  -- months > 9, days > 9
                    THEN ('20' || RIGHT(date_stock, 2) || '-' || 
                                  LEFT(date_stock, 2) || '-' || 
                                  SUBSTR(date_stock, 4, 2))::date 
                 WHEN LENGTH(date_stock) = 7 AND SUBSTR(date_stock,2,1) = '/' -- months < 10, days > 9
                    THEN ('20' || RIGHT(date_stock, 2) || '-' || 
                                  LEFT(date_stock, 1) || '-' || 
                                  SUBSTR(date_stock, 3, 2))::date 
                    ELSE ('20' || RIGHT(date_stock, 2) || '-' ||              -- months > 9, days < 10
                                  LEFT(date_stock, 2) || '-' || 
                                  SUBSTR(date_stock, 4, 2))::date   -- notice the data cast (à lá Haskell)
            END AS clean_date
          FROM
            stocks;"
 
data <- dbGetQuery(db2, query)

## Warning in postgresqlQuickSQL(conn, statement, ...): Could not create executeSELECT
##             id, 
##             date_stock,
##             CASE WHEN LENGTH(date_stock) = 6                                  -- months 1 to 9, days 1 to 9
##                     THEN ('20' || RIGHT(date_stock, 2) || '-' || 
##                                   LEFT(date_stock, 1) || '-' || 
##                                   SUBSTR(date_stock, 3, 1))::date 
##                  WHEN LENGTH(date_stock) = 8                                  -- months > 9, days > 9
##                     THEN ('20' || RIGHT(date_stock, 2) || '-' || 
##                                   LEFT(date_stock, 2) || '-' || 
##                                   SUBSTR(date_stock, 4, 2))::date 
##                  WHEN LENGTH(date_stock) = 7 AND SUBSTR(date_stock,2,1) = '/' -- months < 10, days > 9
##                     THEN ('20' || RIGHT(date_stock, 2) || '-' || 
##                                   LEFT(date_stock, 1) || '-' || 
##                                   SUBSTR(date_stock, 3, 2))::date 
##                     ELSE ('20' || RIGHT(date_stock, 2) || '-' ||              -- months > 9, days < 10
##                                   LEFT(date_stock, 2) || '-' || 
##                                   SUBSTR(date_stock, 4, 2))::date   -- notice the data cast (à lá Haskell)
##             END AS clean_date
##           FROM
##             stocks;

head(data, 20)

## NULL

The function NOW() provides the current time (as a timestamp datatype):

query <- "SELECT
            date_stock, id, NOW() AS current_time
          FROM
            stocks;"
 
data <- dbGetQuery(db2, query)
head(data)

##   date_stock id        current_time
## 1    1/30/14  1 2015-03-01 09:56:50
## 2    1/29/14  2 2015-03-01 09:56:50
## 3    1/28/14  3 2015-03-01 09:56:50
## 4    1/27/14  4 2015-03-01 09:56:50
## 5    1/24/14  5 2015-03-01 09:56:50
## 6    1/23/14  6 2015-03-01 09:56:50

Timestamps have specific function to retrive their composite information:

query <- "SELECT
            EXTRACT('year' FROM temp_table.current_time) AS current_year
          FROM
            ( SELECT    -- make a sub_query, in order to get the timestamp for the eg
                date_stock, id, NOW() AS current_time 
              FROM stocks
            ) AS temp_table;"
 
data <- dbGetQuery(db2, query)
head(data)

##   current_year
## 1         2015
## 2         2015
## 3         2015
## 4         2015
## 5         2015
## 6         2015

Subqueries

The previous eg has a subquery, the construction of a temporary (named) table needed to get the desired result.

Here’s another eg that shows the rows with the smaller date in the table (the result is not ok, since, as we seen, the dates do not follow a standard description, but let’s assume that was not a problem here). In this case the temporary table was made in the WHERE clause:

query <- "SELECT
            date_stock, id
          FROM
            stocks
          WHERE
            date_stock = ( SELECT min(date_stock) FROM stocks );"
 
data <- dbGetQuery(db2, query)
head(data)

##   date_stock id
## 1    1/10/14 14

We can also have 2+ subqueries and then join them:

query <- "SELECT
            first_name, last_name
          FROM
            customer;"
 
data <- dbGetQuery(db, query)
head(data)

##   first_name last_name
## 1      Jared       Ely
## 2       Mary     Smith
## 3   Patricia   Johnson
## 4      Linda  Williams
## 5    Barbara     Jones
## 6  Elizabeth     Brown

# Show customers names that have last names started by 'An' or 'Ar'

query <- "SELECT
            temp.first_name, temp.last_name
          FROM
          ( SELECT first_name, last_name FROM customer WHERE last_name LIKE 'An%' 
            UNION ALL
            SELECT first_name, last_name FROM customer WHERE last_name LIKE 'Ar%' 
          ) AS temp;"
 
data <- dbGetQuery(db, query)
head(data,20)

##   first_name last_name
## 1       Lisa  Anderson
## 2        Ida   Andrews
## 3       Jose    Andrew
## 4   Beatrice    Arnold
## 5    Melanie Armstrong
## 6       Carl     Artis
## 7      Harry      Arce
## 8     Jordan Archuleta
## 9       Kent Arsenault

## [1] TRUE

## [1] TRUE

## [1] TRUE

## [1] TRUE

About postgreSQL

To run postgreSQL in admin mode: psql -Upostgres
To create a new database: CREATE DATABASE dvdrental;
To add a database into dvdrental database (at shell): pg_restore -U postgres -d dvdrental dvdrental.tar

More info at http://www.postgresqltutorial.com/

Helpful postgreSQL commands (ref):

change to postgres user and open psql prompt: sudo -u postgres psql postgres
list databases: postgres=# \l
list roles: postgres=# \du
create role: postgres=#CREATE ROLE demorole1 WITH LOGIN ENCRYPTED PASSWORD 'password1' CREATEDB;
alter role: postgres=#ALTER ROLE demorole1 CREATEROLE CREATEDB REPLICATION SUPERUSER;
drop role: postgres=#DROP ROLE demorole1;
create database: postgres=#CREATE DATABASE demodb1 WITH OWNER demorole1 ENCODING 'UTF8';
grant privileges to new user: GRANT ALL PRIVILEGES ON DATABASE demodb1 TO demorole1;
drop database: postgres=#DROP DATABASE demodb1;
connect to database: \c <databasename>
list tables in connected database: \dt
list columns on table: \d <tablename>
backup database: pg_dump <databasename> > <outfile>
quit: \q
help commands: \?
help command: \h COMMAND
execute commands from file: \i FILE
execute command in shell: \! COMMAND

Operations besides select using RPostgreSQL

library(RPostgreSQL)
 
drv <- dbDriver("PostgreSQL")
# before this, need to CREATE DATABASE tests;
db <- dbConnect(drv, dbname="tests", user="postgres", password="admin") 

# delete if exists
if (dbExistsTable(db, "books"))
  dbSendQuery(db, "DROP TABLE books;") # or  dbRemoveTable("books");

## <PostgreSQLResult:(3688,4,3)>

# create table
dbSendQuery(db, "CREATE TABLE books (intcolumn integer, floatcolumn float);")

## <PostgreSQLResult:(3688,4,4)>

# insert data, row by row
dbSendQuery(db, "INSERT INTO books VALUES(12, 19.95);")

## <PostgreSQLResult:(3688,4,5)>

dbSendQuery(db, "INSERT INTO books VALUES( 1,  9.95);")

## <PostgreSQLResult:(3688,4,6)>

dbSendQuery(db, "INSERT INTO books VALUES(10, 14.95);")

## <PostgreSQLResult:(3688,4,7)>

dbSendQuery(db, "INSERT INTO books VALUES( 2,  4.95);")

## <PostgreSQLResult:(3688,4,8)>

dbSendQuery(db, "INSERT INTO books VALUES( 3,  1.45);")

## <PostgreSQLResult:(3688,4,9)>

# get contents
dbGetQuery(db, "SELECT * FROM books;")

##   intcolumn floatcolumn
## 1        12       19.95
## 2         1        9.95
## 3        10       14.95
## 4         2        4.95
## 5         3        1.45

# get contents (lazy)
data <- dbSendQuery(db, "SELECT * FROM books;")
fetch(data, 3)       # get the next 2 records (use -1 to fetch all)

##   intcolumn floatcolumn
## 1        12       19.95
## 2         1        9.95
## 3        10       14.95

dbGetRowCount(data)  # how many rows were processed so far

## [1] 3

fetch(data, 2)

##   intcolumn floatcolumn
## 4         2        4.95
## 5         3        1.45

dbGetRowCount(data)

## [1] 5

# get the DBMS statement of a given resulst
dbGetStatement(data)

## [1] "SELECT * FROM books;"

dbGetRowsAffected(data) # how many rows were affected (-1 if none was)

## [1] -1

# get avalailable tables in database
dbListTables(db)

## [1] "books"

# get list of table's column names
dbListFields(db, "books")

## [1] "intcolumn"   "floatcolumn"

# import table into data frame
df <- dbReadTable(db, "books")
df

##   intcolumn floatcolumn
## 1        12       19.95
## 2         1        9.95
## 3        10       14.95
## 4         2        4.95
## 5         3        1.45

# export data frame into db table
df2 <- data.frame(id=1:5, x=runif(10), y=sample(letters,10))
df2

##    id          x y
## 1   1 0.09060102 v
## 2   2 0.73327937 i
## 3   3 0.99108526 p
## 4   4 0.41674766 o
## 5   5 0.37172027 b
## 6   1 0.30767700 q
## 7   2 0.47351955 u
## 8   3 0.26539024 c
## 9   4 0.03744036 x
## 10  5 0.90293034 m

dbWriteTable(db, "new_table", df2)

## [1] TRUE

dbReadTable(db, "new_table")

##    id          x y
## 1   1 0.09060102 v
## 2   2 0.73327937 i
## 3   3 0.99108526 p
## 4   4 0.41674766 o
## 5   5 0.37172027 b
## 6   1 0.30767700 q
## 7   2 0.47351955 u
## 8   3 0.26539024 c
## 9   4 0.03744036 x
## 10  5 0.90293034 m

dbClearResult(data) # flushes any pending data and frees the resources used by result set

## [1] TRUE

# postgreSQL transactions
dbSendQuery(db, "BEGIN;")            # begin transaction

## <PostgreSQLResult:(3688,4,25)>

dbRemoveTable(db,"new_table")

## [1] TRUE

dbExistsTable(db,"new_table")

## [1] FALSE

dbRollback(db)                       # rollback, ie, undo all changes since last begin

## [1] TRUE

dbExistsTable(db,"new_table")

## [1] TRUE

dbSendQuery(db, "BEGIN;")

## <PostgreSQLResult:(3688,4,34)>

dbRemoveTable(db,"new_table")

## [1] TRUE

dbExistsTable(db,"new_table")

## [1] FALSE

dbCommit(db)                         # or dbSendQuery(db, "COMMIT;")

## [1] TRUE

dbExistsTable(db,"new_table")

## [1] FALSE

dbDisconnect(db)    # disconnects with the database

## [1] TRUE

dbUnloadDriver(drv) # unload driver (frees all resources)

## [1] TRUE