Longitudinal data organization • alda

library(alda)
library(dplyr)
library(tidyr)

Longitudinal data formats

Longitudinal data can be organized into two distinct formats:

A person-level, wide, or multivariate, format where each person has only one row of data and multiple columns containing data from each measurement occasion.

# Preview the person-level data: one row per person, one column per occasion.
deviant_tolerance_pl |>
  glimpse()
#> Rows: 16
#> Columns: 8
#> $ id           <fct> 9, 45, 268, 314, 442, 514, 569, 624, 723, 918, 949, 978, …
#> $ tolerance_11 <dbl> 2.23, 1.12, 1.45, 1.22, 1.45, 1.34, 1.79, 1.12, 1.22, 1.0…
#> $ tolerance_12 <dbl> 1.79, 1.45, 1.34, 1.22, 1.99, 1.67, 1.90, 1.12, 1.34, 1.0…
#> $ tolerance_13 <dbl> 1.90, 1.45, 1.99, 1.55, 1.45, 2.23, 1.90, 1.22, 1.12, 1.2…
#> $ tolerance_14 <dbl> 2.12, 1.45, 1.79, 1.12, 1.67, 2.12, 1.99, 1.12, 1.00, 1.9…
#> $ tolerance_15 <dbl> 2.66, 1.99, 1.34, 1.12, 1.90, 2.44, 1.99, 1.22, 1.12, 1.2…
#> $ male         <dbl> 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0
#> $ exposure     <dbl> 1.54, 1.16, 0.90, 0.81, 1.13, 0.90, 1.99, 0.98, 0.81, 1.2…

A person-period, long, or univariate, format where each person has one row of data for each measurement occasion.

# Preview the person-period data: one row per person per occasion.
deviant_tolerance_pp |>
  glimpse()
#> Rows: 80
#> Columns: 5
#> $ id        <fct> 9, 9, 9, 9, 9, 45, 45, 45, 45, 45, 268, 268, 268, 268, 268, …
#> $ age       <dbl> 11, 12, 13, 14, 15, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15, …
#> $ tolerance <dbl> 2.23, 1.79, 1.90, 2.12, 2.66, 1.12, 1.45, 1.45, 1.45, 1.99, …
#> $ male      <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, …
#> $ exposure  <dbl> 1.54, 1.54, 1.54, 1.54, 1.54, 1.16, 1.16, 1.16, 1.16, 1.16, …

Most R functions expect data to be in the person-period format for visualization and analysis, but it’s easy to convert a longitudinal data set from one format to the other.

Converting between formats

To convert a person-level data set to person-period format, use tidyr::pivot_longer():

# Stack the `tolerance_<age>` columns into one row per person per age. The
# digits captured from each column name become the integer `age` column, and
# the cell values are collected in `tolerance`.
deviant_tolerance_pl |>
  pivot_longer(
    cols = starts_with("tolerance_"),
    names_to = "age",
    names_pattern = "([[:digit:]]+)",
    names_transform = as.integer,
    values_to = "tolerance"
  )
#> # A tibble: 80 × 5
#>    id     male exposure   age tolerance
#>    <fct> <dbl>    <dbl> <int>     <dbl>
#>  1 9         0     1.54    11      2.23
#>  2 9         0     1.54    12      1.79
#>  3 9         0     1.54    13      1.9 
#>  4 9         0     1.54    14      2.12
#>  5 9         0     1.54    15      2.66
#>  6 45        1     1.16    11      1.12
#>  7 45        1     1.16    12      1.45
#>  8 45        1     1.16    13      1.45
#>  9 45        1     1.16    14      1.45
#> 10 45        1     1.16    15      1.99
#> # ℹ 70 more rows

To convert a person-period data set to person-level format, use tidyr::pivot_wider():

# Spread `tolerance` back out to one column per age, naming each new column
# `tolerance_<age>`.
deviant_tolerance_pp |>
  pivot_wider(
    names_from = age,
    values_from = tolerance,
    names_prefix = "tolerance_"
  )
#> # A tibble: 16 × 8
#>    id     male exposure tolerance_11 tolerance_12 tolerance_13 tolerance_14
#>    <fct> <dbl>    <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
#>  1 9         0     1.54         2.23         1.79         1.9          2.12
#>  2 45        1     1.16         1.12         1.45         1.45         1.45
#>  3 268       1     0.9          1.45         1.34         1.99         1.79
#>  4 314       0     0.81         1.22         1.22         1.55         1.12
#>  5 442       0     1.13         1.45         1.99         1.45         1.67
#>  6 514       1     0.9          1.34         1.67         2.23         2.12
#>  7 569       0     1.99         1.79         1.9          1.9          1.99
#>  8 624       1     0.98         1.12         1.12         1.22         1.12
#>  9 723       0     0.81         1.22         1.34         1.12         1   
#> 10 918       0     1.21         1            1            1.22         1.99
#> 11 949       1     0.93         1.99         1.55         1.12         1.45
#> 12 978       1     1.59         1.22         1.34         2.12         3.46
#> 13 1105      1     1.38         1.34         1.9          1.99         1.9 
#> 14 1542      0     1.44         1.22         1.22         1.99         1.79
#> 15 1552      0     1.04         1            1.12         2.23         1.55
#> 16 1653      0     1.25         1.11         1.11         1.34         1.55
#> # ℹ 1 more variable: tolerance_15 <dbl>

Adding discrete time indicators to person-period data

To add discrete time indicators to a person-period data set, first create a temporary copy of the time variable and a column of ones, then use tidyr::pivot_wider():

# `pivot_wider()` drops the columns it pivots on, so pivot a throwaway copy of
# `age` (which keeps the original `age` column) together with a column of ones.
# Cells for ages that do not match the row are filled with 0.
mutate(deviant_tolerance_pp, temp_age = age, temp_dummy = 1) |>
  pivot_wider(
    names_from = temp_age,
    values_from = temp_dummy,
    names_prefix = "age_",
    values_fill = 0
  )
#> # A tibble: 80 × 10
#>    id      age tolerance  male exposure age_11 age_12 age_13 age_14 age_15
#>    <fct> <dbl>     <dbl> <dbl>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#>  1 9        11      2.23     0     1.54      1      0      0      0      0
#>  2 9        12      1.79     0     1.54      0      1      0      0      0
#>  3 9        13      1.9      0     1.54      0      0      1      0      0
#>  4 9        14      2.12     0     1.54      0      0      0      1      0
#>  5 9        15      2.66     0     1.54      0      0      0      0      1
#>  6 45       11      1.12     1     1.16      1      0      0      0      0
#>  7 45       12      1.45     1     1.16      0      1      0      0      0
#>  8 45       13      1.45     1     1.16      0      0      1      0      0
#>  9 45       14      1.45     1     1.16      0      0      0      1      0
#> 10 45       15      1.99     1     1.16      0      0      0      0      1
#> # ℹ 70 more rows

Adding contiguous periods to person-level survival data

To add contiguous periods to person-level data use dplyr::reframe():

first_sex |>
  # In order to add the event indicator, the time variable needs a different
  # name in the person-level data from the name we want to use in `reframe()`.
  # This is a temporary variable so it doesn't matter what the name is.
  rename(grades = grade) |>
  group_by(id) |>
  reframe(
    # One row per period, from 1 up to the largest `grades` value for this
    # person. `seq_len()` is used rather than `1:max(grades)` so that a maximum
    # of zero yields no rows instead of the descending sequence `c(1, 0)`.
    grade = seq_len(max(grades)),
    # Flag the event with 1 only in the row where the period equals the
    # person's recorded grade and `censor` is 0; every other row gets 0.
    event = if_else(grade == grades & censor == 0, 1, 0),
    # To keep predictors from the person-level data, simply list them. If there
    # are many predictors it might be more convenient to use
    # `dplyr::left_join()` after `reframe()`.
    parental_transition,
    parental_antisociality
  )
#> # A tibble: 1,902 × 5
#>    id    grade event parental_transition parental_antisociality
#>    <fct> <int> <dbl>               <dbl>                  <dbl>
#>  1 1         1     0                   0                  1.98 
#>  2 1         2     0                   0                  1.98 
#>  3 1         3     0                   0                  1.98 
#>  4 1         4     0                   0                  1.98 
#>  5 1         5     0                   0                  1.98 
#>  6 1         6     0                   0                  1.98 
#>  7 1         7     0                   0                  1.98 
#>  8 1         8     0                   0                  1.98 
#>  9 1         9     1                   0                  1.98 
#> 10 2         1     0                   1                 -0.545
#> # ℹ 1,892 more rows