41  Joins


Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

dplyr includes the following commands for performing table joins:

Using the same example data as in the base R example (Chapter 27):

a <- tibble(
  PID = 101:109,
  Hospital = c("UCSF", "HUP", "Stanford",
                "Stanford", "UCSF", "HUP", 
                "HUP", "Stanford", "UCSF"),
  Age = c(22, 34, 41, 19, 53, 21, 63, 22, 19),
  Sex = c(1, 1, 0, 1, 0, 0, 1, 0, 0))
a
# A tibble: 9 × 4
    PID Hospital   Age   Sex
  <int> <chr>    <dbl> <dbl>
1   101 UCSF        22     1
2   102 HUP         34     1
3   103 Stanford    41     0
4   104 Stanford    19     1
5   105 UCSF        53     0
6   106 HUP         21     0
7   107 HUP         63     1
8   108 Stanford    22     0
9   109 UCSF        19     0
b  <- tibble(
  PID = 106:112,
  V1 = c(153, 89, 112, 228,  91, 190, 101),
  Department = c("Neurology", "Radiology",
                "Emergency", "Cardiology",
                "Surgery", "Neurology", "Psychiatry")
)
b
# A tibble: 7 × 3
    PID    V1 Department
  <int> <dbl> <chr>     
1   106   153 Neurology 
2   107    89 Radiology 
3   108   112 Emergency 
4   109   228 Cardiology
5   110    91 Surgery   
6   111   190 Neurology 
7   112   101 Psychiatry

41.1 Inner join

ab_inner <- inner_join(a, b)
Joining with `by = join_by(PID)`
ab_inner
# A tibble: 4 × 6
    PID Hospital   Age   Sex    V1 Department
  <int> <chr>    <dbl> <dbl> <dbl> <chr>     
1   106 HUP         21     0   153 Neurology 
2   107 HUP         63     1    89 Radiology 
3   108 Stanford    22     0   112 Emergency 
4   109 UCSF        19     0   228 Cardiology

Note that the resulting table only contains cases found in both datasets, i.e. IDs 106 through 109

41.2 Outer join

ab_outer <- full_join(a, b)
Joining with `by = join_by(PID)`
ab_outer
# A tibble: 12 × 6
     PID Hospital   Age   Sex    V1 Department
   <int> <chr>    <dbl> <dbl> <dbl> <chr>     
 1   101 UCSF        22     1    NA <NA>      
 2   102 HUP         34     1    NA <NA>      
 3   103 Stanford    41     0    NA <NA>      
 4   104 Stanford    19     1    NA <NA>      
 5   105 UCSF        53     0    NA <NA>      
 6   106 HUP         21     0   153 Neurology 
 7   107 HUP         63     1    89 Radiology 
 8   108 Stanford    22     0   112 Emergency 
 9   109 UCSF        19     0   228 Cardiology
10   110 <NA>        NA    NA    91 Surgery   
11   111 <NA>        NA    NA   190 Neurology 
12   112 <NA>        NA    NA   101 Psychiatry

Note that the resulting data frame contains all cases found in either dataset and missing values are represented with NA.

41.3 Left outer join

ab_leftOuter <- left_join(a, b)
Joining with `by = join_by(PID)`
ab_leftOuter
# A tibble: 9 × 6
    PID Hospital   Age   Sex    V1 Department
  <int> <chr>    <dbl> <dbl> <dbl> <chr>     
1   101 UCSF        22     1    NA <NA>      
2   102 HUP         34     1    NA <NA>      
3   103 Stanford    41     0    NA <NA>      
4   104 Stanford    19     1    NA <NA>      
5   105 UCSF        53     0    NA <NA>      
6   106 HUP         21     0   153 Neurology 
7   107 HUP         63     1    89 Radiology 
8   108 Stanford    22     0   112 Emergency 
9   109 UCSF        19     0   228 Cardiology

Note that the resulting data frame contains all cases present in the left input dataset (i.e. the one defined first in the arguments) only.

41.4 Right outer join

ab_rightOuter <- right_join(a, b)
Joining with `by = join_by(PID)`
ab_rightOuter
# A tibble: 7 × 6
    PID Hospital   Age   Sex    V1 Department
  <int> <chr>    <dbl> <dbl> <dbl> <chr>     
1   106 HUP         21     0   153 Neurology 
2   107 HUP         63     1    89 Radiology 
3   108 Stanford    22     0   112 Emergency 
4   109 UCSF        19     0   228 Cardiology
5   110 <NA>        NA    NA    91 Surgery   
6   111 <NA>        NA    NA   190 Neurology 
7   112 <NA>        NA    NA   101 Psychiatry

Note how the resulting data frame contains all cases present in the right input dataset (i.e. the one defined second in the arguments) only.

41.5 Specifying columns

If the ID columns in the two data.frames to be merged do not have the same name, you can specify column names using the by argument with a slightly peculiar syntax: c("left_by" = "right_by")

a <- tibble(
  PID = 101:109,
  Hospital = c(
    "UCSF", "HUP", "Stanford",
    "Stanford", "UCSF", "HUP", 
    "HUP", "Stanford", "UCSF"),
  Age = c(22, 34, 41, 19, 53, 21, 63, 22, 19),
  Sex = c(1, 1, 0, 1, 0, 0, 1, 0, 0)
)
a
# A tibble: 9 × 4
    PID Hospital   Age   Sex
  <int> <chr>    <dbl> <dbl>
1   101 UCSF        22     1
2   102 HUP         34     1
3   103 Stanford    41     0
4   104 Stanford    19     1
5   105 UCSF        53     0
6   106 HUP         21     0
7   107 HUP         63     1
8   108 Stanford    22     0
9   109 UCSF        19     0
b <- tibble(
  PatientID = 106:112,
  V1 = c(153, 89, 112, 228,  91, 190, 101),
  Department = c(
    "Neurology", "Radiology",
    "Emergency", "Cardiology",
    "Surgery", "Neurology", "Psychiatry"
  )
)
b
# A tibble: 7 × 3
  PatientID    V1 Department
      <int> <dbl> <chr>     
1       106   153 Neurology 
2       107    89 Radiology 
3       108   112 Emergency 
4       109   228 Cardiology
5       110    91 Surgery   
6       111   190 Neurology 
7       112   101 Psychiatry
ab_inner <- inner_join(a, b, by = c("PID" = "PatientID"))
ab_inner
# A tibble: 4 × 6
    PID Hospital   Age   Sex    V1 Department
  <int> <chr>    <dbl> <dbl> <dbl> <chr>     
1   106 HUP         21     0   153 Neurology 
2   107 HUP         63     1    89 Radiology 
3   108 Stanford    22     0   112 Emergency 
4   109 UCSF        19     0   228 Cardiology

41.6 See also

© 2025 E.D. Gennatas