44  Reshaping (tidyr)

Loading required package: dplyr

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
The duckplyr package is configured to fall back to dplyr when it encounters an
incompatibility. Fallback events can be collected and uploaded for analysis to
guide future development. By default, data will be collected but no data will
be uploaded.
ℹ Automatic fallback uploading is not controlled and therefore disabled, see
  `?duckplyr::fallback()`.
✔ Number of reports ready for upload: 6.
→ Review with `duckplyr::fallback_review()`, upload with
  `duckplyr::fallback_upload()`.
ℹ Configure automatic uploading with `duckplyr::fallback_config()`.
✔ Overwriting dplyr methods with duckplyr methods.
ℹ Turn off with `duckplyr::methods_restore()`.

tidyr does not yet translate to native DuckDB operations when working on DuckDB tibbles. / duckplyr data frames. Therefore it will output regular in-memory tibbles. This is likely to change in future versions.

44.1 Long to wide with pivot_wider()

Using the same example seen in Chapter 26:

tb_long <- duckdb_tibble(
  Account_ID = c(
    8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004,
    8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004
  ),
  Age = c(
    67.8017038366664, 42.9198507293701, 46.2301756642422,
    39.665983196671, 67.8017038366664, 42.9198507293701,
    46.2301756642422, 39.665983196671, 67.8017038366664,
    42.9198507293701, 46.2301756642422, 39.665983196671,
    67.8017038366664, 42.9198507293701, 46.2301756642422,
    39.665983196671
  ),
  Admission = c(
    "ED", "Planned", "Planned", "ED", "ED", "Planned",
    "Planned", "ED", "ED", "Planned", "Planned", "ED", "ED", "Planned",
    "Planned", "ED"
  ),
  Lab_key = c(
    "RBC", "RBC", "RBC", "RBC", "WBC", "WBC", "WBC", "WBC",
    "Hematocrit", "Hematocrit", "Hematocrit", "Hematocrit",
    "Hemoglobin", "Hemoglobin", "Hemoglobin", "Hemoglobin"
  ),
  Lab_value = c(
    4.63449321082268, 3.34968550627897, 4.27037213597765,
    4.93897736897793, 8374.22887757195, 7612.37380499927,
    8759.27855519425, 6972.28096216548, 36.272693147236,
    40.5716317809522, 39.9888624177955, 39.8786884058422,
    12.6188444991545, 12.1739747363806, 15.1293426442183,
    14.8885696185238
  )
)
tb_long
# A duckplyr data frame: 5 variables
   Account_ID   Age Admission Lab_key    Lab_value
        <dbl> <dbl> <chr>     <chr>          <dbl>
 1       8001  67.8 ED        RBC             4.63
 2       8002  42.9 Planned   RBC             3.35
 3       8003  46.2 Planned   RBC             4.27
 4       8004  39.7 ED        RBC             4.94
 5       8001  67.8 ED        WBC          8374.  
 6       8002  42.9 Planned   WBC          7612.  
 7       8003  46.2 Planned   WBC          8759.  
 8       8004  39.7 ED        WBC          6972.  
 9       8001  67.8 ED        Hematocrit     36.3 
10       8002  42.9 Planned   Hematocrit     40.6 
11       8003  46.2 Planned   Hematocrit     40.0 
12       8004  39.7 ED        Hematocrit     39.9 
13       8001  67.8 ED        Hemoglobin     12.6 
14       8002  42.9 Planned   Hemoglobin     12.2 
15       8003  46.2 Planned   Hemoglobin     15.1 
16       8004  39.7 ED        Hemoglobin     14.9 
tb_wide <- pivot_wider(
        tb_long,
        id_cols = c("Account_ID", "Age", "Admission"),
        names_from = "Lab_key",
        values_from = "Lab_value")
tb_wide
# A tibble: 4 × 7
  Account_ID   Age Admission   RBC   WBC Hematocrit Hemoglobin
       <dbl> <dbl> <chr>     <dbl> <dbl>      <dbl>      <dbl>
1       8001  67.8 ED         4.63 8374.       36.3       12.6
2       8002  42.9 Planned    3.35 7612.       40.6       12.2
3       8003  46.2 Planned    4.27 8759.       40.0       15.1
4       8004  39.7 ED         4.94 6972.       39.9       14.9

44.2 Wide to long with pivot_longer()

44.2.1 Example 1: Timepoints

tb_wide <- tibble(
        ID = 1:4,
        Timepoint_A = 11:14,
        Timepoint_B = 21:24,
        Timepoint_C = 51:54)
tb_wide
# A tibble: 4 × 4
     ID Timepoint_A Timepoint_B Timepoint_C
  <int>       <int>       <int>       <int>
1     1          11          21          51
2     2          12          22          52
3     3          13          23          53
4     4          14          24          54
tb_long <- pivot_longer(tb_wide,
                        cols = starts_with("Timepoint_"),
                        names_to = "Timepoint",
                        values_to = "Score")
tb_long
# A tibble: 12 × 3
      ID Timepoint   Score
   <int> <chr>       <int>
 1     1 Timepoint_A    11
 2     1 Timepoint_B    21
 3     1 Timepoint_C    51
 4     2 Timepoint_A    12
 5     2 Timepoint_B    22
 6     2 Timepoint_C    52
 7     3 Timepoint_A    13
 8     3 Timepoint_B    23
 9     3 Timepoint_C    53
10     4 Timepoint_A    14
11     4 Timepoint_B    24
12     4 Timepoint_C    54

44.2.2 Example 2: Key-value pairs

set.seed(2022)
tb_wide <- tibble(
  Account_ID = c(8001, 8002, 8003, 8004),
  Age = rnorm(4, mean = 57, sd = 12),
  RBC = rnorm(4, mean = 4.8, sd = 0.5),
  WBC = rnorm(4, mean = 7250, sd = 1500),
  Hematocrit = rnorm(4, mean = 40.2, sd = 4),
  Hemoglobin = rnorm(4, mean = 13.6, sd = 1.5),
  Admission = sample(c("ED", "Planned"), size = 4, replace = TRUE)
)
tb_wide
# A tibble: 4 × 7
  Account_ID   Age   RBC   WBC Hematocrit Hemoglobin Admission
       <dbl> <dbl> <dbl> <dbl>      <dbl>      <dbl> <chr>    
1       8001  67.8  4.63 8374.       36.3       12.6 ED       
2       8002  42.9  3.35 7612.       40.6       12.2 Planned  
3       8003  46.2  4.27 8759.       40.0       15.1 Planned  
4       8004  39.7  4.94 6972.       39.9       14.9 Planned  
tb_long <- pivot_longer(tb_wide,
                        cols = 3:6,
                        names_to = "Lab_key",
                        values_to = "Lab_value")
tb_long
# A tibble: 16 × 5
   Account_ID   Age Admission Lab_key    Lab_value
        <dbl> <dbl> <chr>     <chr>          <dbl>
 1       8001  67.8 ED        RBC             4.63
 2       8001  67.8 ED        WBC          8374.  
 3       8001  67.8 ED        Hematocrit     36.3 
 4       8001  67.8 ED        Hemoglobin     12.6 
 5       8002  42.9 Planned   RBC             3.35
 6       8002  42.9 Planned   WBC          7612.  
 7       8002  42.9 Planned   Hematocrit     40.6 
 8       8002  42.9 Planned   Hemoglobin     12.2 
 9       8003  46.2 Planned   RBC             4.27
10       8003  46.2 Planned   WBC          8759.  
11       8003  46.2 Planned   Hematocrit     40.0 
12       8003  46.2 Planned   Hemoglobin     15.1 
13       8004  39.7 Planned   RBC             4.94
14       8004  39.7 Planned   WBC          6972.  
15       8004  39.7 Planned   Hematocrit     39.9 
16       8004  39.7 Planned   Hemoglobin     14.9 

or using column names:

tb_long <- pivot_longer(tb_wide,
                        cols = c("RBC", "WBC", "Hematocrit", "Hemoglobin"),
                        names_to = "Lab_key",
                        values_to = "Lab_value")
tb_long
# A tibble: 16 × 5
   Account_ID   Age Admission Lab_key    Lab_value
        <dbl> <dbl> <chr>     <chr>          <dbl>
 1       8001  67.8 ED        RBC             4.63
 2       8001  67.8 ED        WBC          8374.  
 3       8001  67.8 ED        Hematocrit     36.3 
 4       8001  67.8 ED        Hemoglobin     12.6 
 5       8002  42.9 Planned   RBC             3.35
 6       8002  42.9 Planned   WBC          7612.  
 7       8002  42.9 Planned   Hematocrit     40.6 
 8       8002  42.9 Planned   Hemoglobin     12.2 
 9       8003  46.2 Planned   RBC             4.27
10       8003  46.2 Planned   WBC          8759.  
11       8003  46.2 Planned   Hematocrit     40.0 
12       8003  46.2 Planned   Hemoglobin     15.1 
13       8004  39.7 Planned   RBC             4.94
14       8004  39.7 Planned   WBC          6972.  
15       8004  39.7 Planned   Hematocrit     39.9 
16       8004  39.7 Planned   Hemoglobin     14.9 

Note that the “Age” and “Admission” columns were also reshaped correctly.

44.3 See also

© 2025 E.D. Gennatas