47 Reshaping
47.1 Long to wide using pivot()
47.1.1 Example 1: key-value pairs
Using the same example seen in Chapter 26, we start by showing as_polars_df() to convert a data.frame to a polars DataFrame:
df_long <- data.frame(
Account_ID = as.integer(c(8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004,
8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004)),
Age = c(67.8017038366664, 42.9198507293701, 46.2301756642422,
39.665983196671, 67.8017038366664, 42.9198507293701,
46.2301756642422, 39.665983196671, 67.8017038366664,
42.9198507293701, 46.2301756642422, 39.665983196671,
67.8017038366664, 42.9198507293701, 46.2301756642422,
39.665983196671),
Admission = c("ED", "Planned", "Planned", "ED", "ED", "Planned",
"Planned", "ED", "ED", "Planned", "Planned", "ED", "ED", "Planned",
"Planned", "ED"),
Lab_key = c("RBC", "RBC", "RBC", "RBC", "WBC", "WBC", "WBC", "WBC",
"Hematocrit", "Hematocrit", "Hematocrit", "Hematocrit",
"Hemoglobin", "Hemoglobin", "Hemoglobin", "Hemoglobin"),
Lab_value = c(4.63449321082268, 3.34968550627897, 4.27037213597765,
4.93897736897793, 8374.22887757195, 7612.37380499927,
8759.27855519425, 6972.28096216548, 36.272693147236,
40.5716317809522, 39.9888624177955, 39.8786884058422,
12.6188444991545, 12.1739747363806, 15.1293426442183,
14.8885696185238)
)
pl_long <- as_polars_df(df_long)
pl_longshape: (16, 5)
┌────────────┬───────────┬───────────┬────────────┬─────────────┐
│ Account_ID ┆ Age ┆ Admission ┆ Lab_key ┆ Lab_value │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i32 ┆ f64 ┆ str ┆ str ┆ f64 │
╞════════════╪═══════════╪═══════════╪════════════╪═════════════╡
│ 8001 ┆ 67.801704 ┆ ED ┆ RBC ┆ 4.634493 │
│ 8002 ┆ 42.919851 ┆ Planned ┆ RBC ┆ 3.349686 │
│ 8003 ┆ 46.230176 ┆ Planned ┆ RBC ┆ 4.270372 │
│ 8004 ┆ 39.665983 ┆ ED ┆ RBC ┆ 4.938977 │
│ 8001 ┆ 67.801704 ┆ ED ┆ WBC ┆ 8374.228878 │
│ … ┆ … ┆ … ┆ … ┆ … │
│ 8004 ┆ 39.665983 ┆ ED ┆ Hematocrit ┆ 39.878688 │
│ 8001 ┆ 67.801704 ┆ ED ┆ Hemoglobin ┆ 12.618844 │
│ 8002 ┆ 42.919851 ┆ Planned ┆ Hemoglobin ┆ 12.173975 │
│ 8003 ┆ 46.230176 ┆ Planned ┆ Hemoglobin ┆ 15.129343 │
│ 8004 ┆ 39.665983 ┆ ED ┆ Hemoglobin ┆ 14.88857 │
└────────────┴───────────┴───────────┴────────────┴─────────────┘
Polars provides the pivot() function to reshape data from long to wide format:
pl_wide <- pl_long$pivot(
values = "Lab_value",
index = c("Account_ID", "Age", "Admission"),
on = "Lab_key"
)
pl_wideshape: (4, 7)
┌────────────┬───────────┬───────────┬──────────┬─────────────┬────────────┬────────────┐
│ Account_ID ┆ Age ┆ Admission ┆ RBC ┆ WBC ┆ Hematocrit ┆ Hemoglobin │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i32 ┆ f64 ┆ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞════════════╪═══════════╪═══════════╪══════════╪═════════════╪════════════╪════════════╡
│ 8001 ┆ 67.801704 ┆ ED ┆ 4.634493 ┆ 8374.228878 ┆ 36.272693 ┆ 12.618844 │
│ 8002 ┆ 42.919851 ┆ Planned ┆ 3.349686 ┆ 7612.373805 ┆ 40.571632 ┆ 12.173975 │
│ 8003 ┆ 46.230176 ┆ Planned ┆ 4.270372 ┆ 8759.278555 ┆ 39.988862 ┆ 15.129343 │
│ 8004 ┆ 39.665983 ┆ ED ┆ 4.938977 ┆ 6972.280962 ┆ 39.878688 ┆ 14.88857 │
└────────────┴───────────┴───────────┴──────────┴─────────────┴────────────┴────────────┘
47.1.2 Example 2: Timepoints
Another simple synthetic dataset, this time using pl$DataFrame() to create a polars DataFrame directly:
shape: (12, 3)
┌─────┬─────────────┬───────┐
│ ID ┆ Timepoint ┆ Score │
│ --- ┆ --- ┆ --- │
│ i32 ┆ cat ┆ i32 │
╞═════╪═════════════╪═══════╡
│ 1 ┆ Timepoint_A ┆ 11 │
│ 2 ┆ Timepoint_A ┆ 12 │
│ 3 ┆ Timepoint_A ┆ 13 │
│ 4 ┆ Timepoint_A ┆ 14 │
│ 1 ┆ Timepoint_B ┆ 21 │
│ … ┆ … ┆ … │
│ 4 ┆ Timepoint_B ┆ 24 │
│ 1 ┆ Timepoint_C ┆ 51 │
│ 2 ┆ Timepoint_C ┆ 52 │
│ 3 ┆ Timepoint_C ┆ 53 │
│ 4 ┆ Timepoint_C ┆ 54 │
└─────┴─────────────┴───────┘
Using the pivot() function again:
pl_wide <- pl_long$pivot(
values = "Score",
index = "ID",
on = "Timepoint"
)
pl_wideshape: (4, 4)
┌─────┬─────────────┬─────────────┬─────────────┐
│ ID ┆ Timepoint_A ┆ Timepoint_B ┆ Timepoint_C │
│ --- ┆ --- ┆ --- ┆ --- │
│ i32 ┆ i32 ┆ i32 ┆ i32 │
╞═════╪═════════════╪═════════════╪═════════════╡
│ 1 ┆ 11 ┆ 21 ┆ 51 │
│ 2 ┆ 12 ┆ 22 ┆ 52 │
│ 3 ┆ 13 ┆ 23 ┆ 53 │
│ 4 ┆ 14 ┆ 24 ┆ 54 │
└─────┴─────────────┴─────────────┴─────────────┘
47.1.3 pivot() + aggregate
If your data contains multiple entries for the same combination of index and column values, you can use the aggregate_function argument to specify how to aggregate these values.
For example, suppose you have four subjects with IDs “A”, “B”, “C”, “D” who had a couple variables measured 3 times in the AM and 3 times in the PM.
shape: (24, 4)
┌─────┬───────────┬───────────┬───────────┐
│ ID ┆ Timepoint ┆ Var1 ┆ Var2 │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ f64 ┆ f64 │
╞═════╪═══════════╪═══════════╪═══════════╡
│ A ┆ AM ┆ 9.364677 ┆ 20.16012 │
│ A ┆ AM ┆ 10.55661 ┆ 20.173266 │
│ A ┆ AM ┆ 10.566927 ┆ 20.76316 │
│ A ┆ PM ┆ 8.563775 ┆ 22.37146 │
│ A ┆ PM ┆ 8.304662 ┆ 19.563726 │
│ … ┆ … ┆ … ┆ … │
│ D ┆ AM ┆ 10.289381 ┆ 19.546063 │
│ D ┆ AM ┆ 9.881024 ┆ 19.676375 │
│ D ┆ PM ┆ 9.752491 ┆ 18.249215 │
│ D ┆ PM ┆ 13.147432 ┆ 20.090258 │
│ D ┆ PM ┆ 9.438623 ┆ 20.937169 │
└─────┴───────────┴───────────┴───────────┘
To convert the above DataFrame to wide format and get mean AM and PM values using the aggregate_function argument:
pl_wide <- pl_long$pivot(
values = c("Var1", "Var2"),
index = "ID",
on = "Timepoint",
aggregate_function = "mean"
)
pl_wideshape: (4, 5)
┌─────┬───────────┬───────────┬───────────┬───────────┐
│ ID ┆ Var1_AM ┆ Var1_PM ┆ Var2_AM ┆ Var2_PM │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞═════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ A ┆ 10.162738 ┆ 8.989228 ┆ 20.365515 ┆ 20.859754 │
│ B ┆ 9.752661 ┆ 8.919559 ┆ 19.612417 ┆ 20.024272 │
│ C ┆ 10.064019 ┆ 9.36731 ┆ 20.097893 ┆ 19.624808 │
│ D ┆ 9.801179 ┆ 10.779516 ┆ 19.483709 ┆ 19.758881 │
└─────┴───────────┴───────────┴───────────┴───────────┘
Here, “mean” is a convenience string that is equivalent to the more verbose pl\(element()\)mean(). Other convenience strings include “first”, “last”, “min”, “max”, “median”, “sum”, and “count”.
47.2 Wide to long using unpivot()
47.2.1 Example 1: Timepoints
pl_wide <- pl$DataFrame(
ID = 1:4,
Timepoint_A = 11:14,
Timepoint_B = 21:24,
Timepoint_C = 51:54
)
pl_wideshape: (4, 4)
┌─────┬─────────────┬─────────────┬─────────────┐
│ ID ┆ Timepoint_A ┆ Timepoint_B ┆ Timepoint_C │
│ --- ┆ --- ┆ --- ┆ --- │
│ i32 ┆ i32 ┆ i32 ┆ i32 │
╞═════╪═════════════╪═════════════╪═════════════╡
│ 1 ┆ 11 ┆ 21 ┆ 51 │
│ 2 ┆ 12 ┆ 22 ┆ 52 │
│ 3 ┆ 13 ┆ 23 ┆ 53 │
│ 4 ┆ 14 ┆ 24 ┆ 54 │
└─────┴─────────────┴─────────────┴─────────────┘
Use unpivot() to convert from wide to long format:
pl_long <- pl_wide$unpivot(
index = "ID",
on = c("Timepoint_A", "Timepoint_B", "Timepoint_C"),
variable_name = "Timepoint",
value_name = "Score"
)
pl_longshape: (12, 3)
┌─────┬─────────────┬───────┐
│ ID ┆ Timepoint ┆ Score │
│ --- ┆ --- ┆ --- │
│ i32 ┆ str ┆ i32 │
╞═════╪═════════════╪═══════╡
│ 1 ┆ Timepoint_A ┆ 11 │
│ 2 ┆ Timepoint_A ┆ 12 │
│ 3 ┆ Timepoint_A ┆ 13 │
│ 4 ┆ Timepoint_A ┆ 14 │
│ 1 ┆ Timepoint_B ┆ 21 │
│ … ┆ … ┆ … │
│ 4 ┆ Timepoint_B ┆ 24 │
│ 1 ┆ Timepoint_C ┆ 51 │
│ 2 ┆ Timepoint_C ┆ 52 │
│ 3 ┆ Timepoint_C ┆ 53 │
│ 4 ┆ Timepoint_C ┆ 54 │
└─────┴─────────────┴───────┘
47.2.2 Example 2: Key-value pairs
Using the same synthetic data as in Chapter 26:
set.seed(2022)
pl_wide <- pl$DataFrame(
Account_ID = c(8001, 8002, 8003, 8004),
Age = rnorm(4, mean = 57, sd = 12),
RBC = rnorm(4, mean = 4.8, sd = 0.5),
WBC = rnorm(4, mean = 7250, sd = 1500),
Hematocrit = rnorm(4, mean = 40.2, sd = 4),
Hemoglobin = rnorm(4, mean = 13.6, sd = 1.5),
Admission = sample(c("ED", "Planned"), size = 4, replace = TRUE)
)
pl_wideshape: (4, 7)
┌────────────┬───────────┬──────────┬─────────────┬────────────┬────────────┬───────────┐
│ Account_ID ┆ Age ┆ RBC ┆ WBC ┆ Hematocrit ┆ Hemoglobin ┆ Admission │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │
╞════════════╪═══════════╪══════════╪═════════════╪════════════╪════════════╪═══════════╡
│ 8001.0 ┆ 67.801704 ┆ 4.634493 ┆ 8374.228878 ┆ 36.272693 ┆ 12.618844 ┆ ED │
│ 8002.0 ┆ 42.919851 ┆ 3.349686 ┆ 7612.373805 ┆ 40.571632 ┆ 12.173975 ┆ Planned │
│ 8003.0 ┆ 46.230176 ┆ 4.270372 ┆ 8759.278555 ┆ 39.988862 ┆ 15.129343 ┆ Planned │
│ 8004.0 ┆ 39.665983 ┆ 4.938977 ┆ 6972.280962 ┆ 39.878688 ┆ 14.88857 ┆ Planned │
└────────────┴───────────┴──────────┴─────────────┴────────────┴────────────┴───────────┘
Convert from wide to long format using unpivot() again:
shape: (16, 5)
┌────────────┬───────────┬───────────┬────────────┬─────────────┐
│ Account_ID ┆ Age ┆ Admission ┆ Lab_key ┆ Lab_value │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ str ┆ str ┆ f64 │
╞════════════╪═══════════╪═══════════╪════════════╪═════════════╡
│ 8001.0 ┆ 67.801704 ┆ ED ┆ RBC ┆ 4.634493 │
│ 8002.0 ┆ 42.919851 ┆ Planned ┆ RBC ┆ 3.349686 │
│ 8003.0 ┆ 46.230176 ┆ Planned ┆ RBC ┆ 4.270372 │
│ 8004.0 ┆ 39.665983 ┆ Planned ┆ RBC ┆ 4.938977 │
│ 8001.0 ┆ 67.801704 ┆ ED ┆ WBC ┆ 8374.228878 │
│ … ┆ … ┆ … ┆ … ┆ … │
│ 8004.0 ┆ 39.665983 ┆ Planned ┆ Hematocrit ┆ 39.878688 │
│ 8001.0 ┆ 67.801704 ┆ ED ┆ Hemoglobin ┆ 12.618844 │
│ 8002.0 ┆ 42.919851 ┆ Planned ┆ Hemoglobin ┆ 12.173975 │
│ 8003.0 ┆ 46.230176 ┆ Planned ┆ Hemoglobin ┆ 15.129343 │
│ 8004.0 ┆ 39.665983 ┆ Planned ┆ Hemoglobin ┆ 14.88857 │
└────────────┴───────────┴───────────┴────────────┴─────────────┘