47 Reshaping

library(polars)

47.1 Long to wide using `pivot()`

47.1.1 Example 1: key-value pairs

Using the same example seen in Chapter 26, we start by showing as_polars_df() to convert a data.frame to a polars DataFrame:

df_long <- data.frame(
    Account_ID = as.integer(c(8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004, 
            8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004)), 
    Age = c(67.8017038366664, 42.9198507293701, 46.2301756642422, 
            39.665983196671, 67.8017038366664, 42.9198507293701, 
            46.2301756642422, 39.665983196671, 67.8017038366664, 
            42.9198507293701, 46.2301756642422, 39.665983196671, 
            67.8017038366664, 42.9198507293701, 46.2301756642422, 
            39.665983196671), 
    Admission = c("ED", "Planned", "Planned", "ED", "ED", "Planned", 
            "Planned", "ED", "ED", "Planned", "Planned", "ED", "ED", "Planned", 
            "Planned", "ED"), 
    Lab_key = c("RBC", "RBC", "RBC", "RBC", "WBC", "WBC", "WBC", "WBC", 
            "Hematocrit", "Hematocrit", "Hematocrit", "Hematocrit", 
            "Hemoglobin", "Hemoglobin", "Hemoglobin", "Hemoglobin"), 
    Lab_value = c(4.63449321082268, 3.34968550627897, 4.27037213597765, 
            4.93897736897793, 8374.22887757195, 7612.37380499927, 
            8759.27855519425, 6972.28096216548, 36.272693147236, 
            40.5716317809522, 39.9888624177955, 39.8786884058422, 
            12.6188444991545, 12.1739747363806, 15.1293426442183, 
            14.8885696185238)
)
pl_long <- as_polars_df(df_long)
pl_long

shape: (16, 5)
┌────────────┬───────────┬───────────┬────────────┬─────────────┐
│ Account_ID ┆ Age       ┆ Admission ┆ Lab_key    ┆ Lab_value   │
│ ---        ┆ ---       ┆ ---       ┆ ---        ┆ ---         │
│ i32        ┆ f64       ┆ str       ┆ str        ┆ f64         │
╞════════════╪═══════════╪═══════════╪════════════╪═════════════╡
│ 8001       ┆ 67.801704 ┆ ED        ┆ RBC        ┆ 4.634493    │
│ 8002       ┆ 42.919851 ┆ Planned   ┆ RBC        ┆ 3.349686    │
│ 8003       ┆ 46.230176 ┆ Planned   ┆ RBC        ┆ 4.270372    │
│ 8004       ┆ 39.665983 ┆ ED        ┆ RBC        ┆ 4.938977    │
│ 8001       ┆ 67.801704 ┆ ED        ┆ WBC        ┆ 8374.228878 │
│ …          ┆ …         ┆ …         ┆ …          ┆ …           │
│ 8004       ┆ 39.665983 ┆ ED        ┆ Hematocrit ┆ 39.878688   │
│ 8001       ┆ 67.801704 ┆ ED        ┆ Hemoglobin ┆ 12.618844   │
│ 8002       ┆ 42.919851 ┆ Planned   ┆ Hemoglobin ┆ 12.173975   │
│ 8003       ┆ 46.230176 ┆ Planned   ┆ Hemoglobin ┆ 15.129343   │
│ 8004       ┆ 39.665983 ┆ ED        ┆ Hemoglobin ┆ 14.88857    │
└────────────┴───────────┴───────────┴────────────┴─────────────┘

Polars provides the pivot() function to reshape data from long to wide format:

pl_wide <- pl_long$pivot(
  values = "Lab_value",
  index = c("Account_ID", "Age", "Admission"),
  on = "Lab_key"
)
pl_wide

shape: (4, 7)
┌────────────┬───────────┬───────────┬──────────┬─────────────┬────────────┬────────────┐
│ Account_ID ┆ Age       ┆ Admission ┆ RBC      ┆ WBC         ┆ Hematocrit ┆ Hemoglobin │
│ ---        ┆ ---       ┆ ---       ┆ ---      ┆ ---         ┆ ---        ┆ ---        │
│ i32        ┆ f64       ┆ str       ┆ f64      ┆ f64         ┆ f64        ┆ f64        │
╞════════════╪═══════════╪═══════════╪══════════╪═════════════╪════════════╪════════════╡
│ 8001       ┆ 67.801704 ┆ ED        ┆ 4.634493 ┆ 8374.228878 ┆ 36.272693  ┆ 12.618844  │
│ 8002       ┆ 42.919851 ┆ Planned   ┆ 3.349686 ┆ 7612.373805 ┆ 40.571632  ┆ 12.173975  │
│ 8003       ┆ 46.230176 ┆ Planned   ┆ 4.270372 ┆ 8759.278555 ┆ 39.988862  ┆ 15.129343  │
│ 8004       ┆ 39.665983 ┆ ED        ┆ 4.938977 ┆ 6972.280962 ┆ 39.878688  ┆ 14.88857   │
└────────────┴───────────┴───────────┴──────────┴─────────────┴────────────┴────────────┘

47.1.2 Example 2: Timepoints

Another simple synthetic dataset, this time using pl$DataFrame() to create a polars DataFrame directly:

pl_long <- pl$DataFrame(
  ID = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L),
  Timepoint = factor(
    c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L),
    labels = c("Timepoint_A", "Timepoint_B", "Timepoint_C")
  ),
  Score = c(11L, 12L, 13L, 14L, 21L, 22L, 23L, 24L, 51L, 52L, 53L, 54L)
)
pl_long

shape: (12, 3)
┌─────┬─────────────┬───────┐
│ ID  ┆ Timepoint   ┆ Score │
│ --- ┆ ---         ┆ ---   │
│ i32 ┆ cat         ┆ i32   │
╞═════╪═════════════╪═══════╡
│ 1   ┆ Timepoint_A ┆ 11    │
│ 2   ┆ Timepoint_A ┆ 12    │
│ 3   ┆ Timepoint_A ┆ 13    │
│ 4   ┆ Timepoint_A ┆ 14    │
│ 1   ┆ Timepoint_B ┆ 21    │
│ …   ┆ …           ┆ …     │
│ 4   ┆ Timepoint_B ┆ 24    │
│ 1   ┆ Timepoint_C ┆ 51    │
│ 2   ┆ Timepoint_C ┆ 52    │
│ 3   ┆ Timepoint_C ┆ 53    │
│ 4   ┆ Timepoint_C ┆ 54    │
└─────┴─────────────┴───────┘

Using the pivot() function again:

pl_wide <- pl_long$pivot(
  values = "Score",
  index = "ID",
  on = "Timepoint"
)
pl_wide

shape: (4, 4)
┌─────┬─────────────┬─────────────┬─────────────┐
│ ID  ┆ Timepoint_A ┆ Timepoint_B ┆ Timepoint_C │
│ --- ┆ ---         ┆ ---         ┆ ---         │
│ i32 ┆ i32         ┆ i32         ┆ i32         │
╞═════╪═════════════╪═════════════╪═════════════╡
│ 1   ┆ 11          ┆ 21          ┆ 51          │
│ 2   ┆ 12          ┆ 22          ┆ 52          │
│ 3   ┆ 13          ┆ 23          ┆ 53          │
│ 4   ┆ 14          ┆ 24          ┆ 54          │
└─────┴─────────────┴─────────────┴─────────────┘

47.1.3 `pivot()` + aggregate

If your data contains multiple entries for the same combination of index and column values, you can use the aggregate_function argument to specify how to aggregate these values.

For example, suppose you have four subjects with IDs “A”, “B”, “C”, “D” who had a couple variables measured 3 times in the AM and 3 times in the PM.

pl_long <- pl$DataFrame(
  ID = rep(LETTERS[1:4], each = 6),
  Timepoint = rep(c("AM", "PM"), length.out = 24, each = 3),
  Var1 = rnorm(24, mean = 10),
  Var2 = rnorm(24, mean = 20)
)
pl_long

shape: (24, 4)
┌─────┬───────────┬───────────┬───────────┐
│ ID  ┆ Timepoint ┆ Var1      ┆ Var2      │
│ --- ┆ ---       ┆ ---       ┆ ---       │
│ str ┆ str       ┆ f64       ┆ f64       │
╞═════╪═══════════╪═══════════╪═══════════╡
│ A   ┆ AM        ┆ 9.699792  ┆ 18.773582 │
│ A   ┆ AM        ┆ 9.000198  ┆ 19.306904 │
│ A   ┆ AM        ┆ 10.46846  ┆ 19.962325 │
│ A   ┆ PM        ┆ 9.607685  ┆ 20.810123 │
│ A   ┆ PM        ┆ 9.414937  ┆ 20.227719 │
│ …   ┆ …         ┆ …         ┆ …         │
│ D   ┆ AM        ┆ 9.573165  ┆ 18.634142 │
│ D   ┆ AM        ┆ 11.348555 ┆ 20.77983  │
│ D   ┆ PM        ┆ 9.007919  ┆ 18.427543 │
│ D   ┆ PM        ┆ 10.141778 ┆ 21.090621 │
│ D   ┆ PM        ┆ 8.472125  ┆ 21.615966 │
└─────┴───────────┴───────────┴───────────┘

To convert the above DataFrame to wide format and get mean AM and PM values using the aggregate_function argument:

pl_wide <- pl_long$pivot(
  values = c("Var1", "Var2"),
  index = "ID",
  on = "Timepoint",
  aggregate_function = "mean"
)
pl_wide

shape: (4, 5)
┌─────┬───────────┬───────────┬───────────┬───────────┐
│ ID  ┆ Var1_AM   ┆ Var1_PM   ┆ Var2_AM   ┆ Var2_PM   │
│ --- ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ str ┆ f64       ┆ f64       ┆ f64       ┆ f64       │
╞═════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ A   ┆ 9.722817  ┆ 9.559052  ┆ 19.347603 ┆ 20.423999 │
│ B   ┆ 10.195484 ┆ 10.407775 ┆ 20.321431 ┆ 19.956038 │
│ C   ┆ 10.852523 ┆ 10.468213 ┆ 19.842704 ┆ 19.832144 │
│ D   ┆ 10.415963 ┆ 9.207274  ┆ 19.64846  ┆ 20.378043 │
└─────┴───────────┴───────────┴───────────┴───────────┘

Here, “mean” is a convenience string that is equivalent to the more verbose pl$element()$mean(). Other convenience strings include “first”, “last”, “min”, “max”, “median”, “sum”, and “count”.

47.2 Wide to long using `unpivot()`

47.2.1 Example 1: Timepoints

pl_wide <- pl$DataFrame(
    ID = 1:4,
    Timepoint_A = 11:14,
    Timepoint_B = 21:24,
    Timepoint_C = 51:54
)
pl_wide

shape: (4, 4)
┌─────┬─────────────┬─────────────┬─────────────┐
│ ID  ┆ Timepoint_A ┆ Timepoint_B ┆ Timepoint_C │
│ --- ┆ ---         ┆ ---         ┆ ---         │
│ i32 ┆ i32         ┆ i32         ┆ i32         │
╞═════╪═════════════╪═════════════╪═════════════╡
│ 1   ┆ 11          ┆ 21          ┆ 51          │
│ 2   ┆ 12          ┆ 22          ┆ 52          │
│ 3   ┆ 13          ┆ 23          ┆ 53          │
│ 4   ┆ 14          ┆ 24          ┆ 54          │
└─────┴─────────────┴─────────────┴─────────────┘

Use unpivot() to convert from wide to long format:

pl_long <- pl_wide$unpivot(
  index = "ID",
  on = c("Timepoint_A", "Timepoint_B", "Timepoint_C"),
  variable_name = "Timepoint",
  value_name = "Score"
)
pl_long

shape: (12, 3)
┌─────┬─────────────┬───────┐
│ ID  ┆ Timepoint   ┆ Score │
│ --- ┆ ---         ┆ ---   │
│ i32 ┆ str         ┆ i32   │
╞═════╪═════════════╪═══════╡
│ 1   ┆ Timepoint_A ┆ 11    │
│ 2   ┆ Timepoint_A ┆ 12    │
│ 3   ┆ Timepoint_A ┆ 13    │
│ 4   ┆ Timepoint_A ┆ 14    │
│ 1   ┆ Timepoint_B ┆ 21    │
│ …   ┆ …           ┆ …     │
│ 4   ┆ Timepoint_B ┆ 24    │
│ 1   ┆ Timepoint_C ┆ 51    │
│ 2   ┆ Timepoint_C ┆ 52    │
│ 3   ┆ Timepoint_C ┆ 53    │
│ 4   ┆ Timepoint_C ┆ 54    │
└─────┴─────────────┴───────┘

47.2.2 Example 2: Key-value pairs

Using the same synthetic data as in Chapter 26:

set.seed(2022)
pl_wide <- pl$DataFrame(
  Account_ID = c(8001, 8002, 8003, 8004),
  Age = rnorm(4, mean = 57, sd = 12),
  RBC = rnorm(4, mean = 4.8, sd = 0.5),
  WBC = rnorm(4, mean = 7250, sd = 1500),
  Hematocrit = rnorm(4, mean = 40.2, sd = 4),
  Hemoglobin = rnorm(4, mean = 13.6, sd = 1.5),
  Admission = sample(c("ED", "Planned"), size = 4, replace = TRUE)
)
pl_wide

shape: (4, 7)
┌────────────┬───────────┬──────────┬─────────────┬────────────┬────────────┬───────────┐
│ Account_ID ┆ Age       ┆ RBC      ┆ WBC         ┆ Hematocrit ┆ Hemoglobin ┆ Admission │
│ ---        ┆ ---       ┆ ---      ┆ ---         ┆ ---        ┆ ---        ┆ ---       │
│ f64        ┆ f64       ┆ f64      ┆ f64         ┆ f64        ┆ f64        ┆ str       │
╞════════════╪═══════════╪══════════╪═════════════╪════════════╪════════════╪═══════════╡
│ 8001.0     ┆ 67.801704 ┆ 4.634493 ┆ 8374.228878 ┆ 36.272693  ┆ 12.618844  ┆ ED        │
│ 8002.0     ┆ 42.919851 ┆ 3.349686 ┆ 7612.373805 ┆ 40.571632  ┆ 12.173975  ┆ Planned   │
│ 8003.0     ┆ 46.230176 ┆ 4.270372 ┆ 8759.278555 ┆ 39.988862  ┆ 15.129343  ┆ Planned   │
│ 8004.0     ┆ 39.665983 ┆ 4.938977 ┆ 6972.280962 ┆ 39.878688  ┆ 14.88857   ┆ Planned   │
└────────────┴───────────┴──────────┴─────────────┴────────────┴────────────┴───────────┘

Convert from wide to long format using unpivot() again:

pl_long <- pl_wide$unpivot(
  index = c("Account_ID", "Age", "Admission"),
  on = c("RBC", "WBC", "Hematocrit", "Hemoglobin"),
  variable_name = "Lab_key",
  value_name = "Lab_value"
)
pl_long

shape: (16, 5)
┌────────────┬───────────┬───────────┬────────────┬─────────────┐
│ Account_ID ┆ Age       ┆ Admission ┆ Lab_key    ┆ Lab_value   │
│ ---        ┆ ---       ┆ ---       ┆ ---        ┆ ---         │
│ f64        ┆ f64       ┆ str       ┆ str        ┆ f64         │
╞════════════╪═══════════╪═══════════╪════════════╪═════════════╡
│ 8001.0     ┆ 67.801704 ┆ ED        ┆ RBC        ┆ 4.634493    │
│ 8002.0     ┆ 42.919851 ┆ Planned   ┆ RBC        ┆ 3.349686    │
│ 8003.0     ┆ 46.230176 ┆ Planned   ┆ RBC        ┆ 4.270372    │
│ 8004.0     ┆ 39.665983 ┆ Planned   ┆ RBC        ┆ 4.938977    │
│ 8001.0     ┆ 67.801704 ┆ ED        ┆ WBC        ┆ 8374.228878 │
│ …          ┆ …         ┆ …         ┆ …          ┆ …           │
│ 8004.0     ┆ 39.665983 ┆ Planned   ┆ Hematocrit ┆ 39.878688   │
│ 8001.0     ┆ 67.801704 ┆ ED        ┆ Hemoglobin ┆ 12.618844   │
│ 8002.0     ┆ 42.919851 ┆ Planned   ┆ Hemoglobin ┆ 12.173975   │
│ 8003.0     ┆ 46.230176 ┆ Planned   ┆ Hemoglobin ┆ 15.129343   │
│ 8004.0     ┆ 39.665983 ┆ Planned   ┆ Hemoglobin ┆ 14.88857    │
└────────────┴───────────┴───────────┴────────────┴─────────────┘

47.2.3 References

Polars reshape

47.1 Long to wide using pivot()

47.1.1 Example 1: key-value pairs

47.1.2 Example 2: Timepoints

47.1.3 pivot() + aggregate

47.2 Wide to long using unpivot()

47.2.1 Example 1: Timepoints

47.2.2 Example 2: Key-value pairs

47.2.3 References

47.1 Long to wide using `pivot()`

47.1.3 `pivot()` + aggregate

47.2 Wide to long using `unpivot()`