Compute User Sessions from Event Data

Processes a dataset to create user sessions based on time gaps, ordering columns, or actor groupings. It supports several ways of determining the order of events in user behavior and provides flexibility in how extra columns are handled when widening the data.

Usage

prepare_data(
  data,
  actor,
  time,
  action,
  order,
  time_threshold = 900,
  custom_format = NULL,
  is_unix_time = FALSE,
  unix_time_unit = "seconds",
  unused_fn = dplyr::first
)

Arguments

data: A data.frame or tibble containing the action/event data.
actor: A character string giving the name of the column that represents a user/actor identifier. If not provided and neither time nor order is specified, the entire dataset is treated as a single session.
time: A character string giving the name of the column representing timestamps of the action events.
action: A character string giving the name of the column holding the information about the action taken.
order: A character string giving the name of a column with sequence numbers or non-unique orderable values that indicate the order of events within an actor group. If no actor is available, the ordering is applied across all of the data. The ordering is used when widening the data. If both actor and time are specified, the sequence order should be specified such that it determines the order of events within each actor and each session.
time_threshold: An integer specifying the time threshold in seconds for creating new time-based sessions. Defaults to 900 seconds.
custom_format: A character string giving the format used to parse the time column.
is_unix_time: A logical value indicating whether the time column is in Unix time. The default is FALSE.
unix_time_unit: A character string giving the Unix time unit when is_unix_time is TRUE. The default is "seconds". Valid options are "seconds", "milliseconds", or "microseconds".
unused_fn: How to handle extra columns when pivoting to wide format. See tidyr::pivot_wider(). The default is to keep all columns and to use the first value.

Value

A tna_data object, which is a list with the following elements:

long_data: The processed data in long format.
sequence_data: The processed sequences in wide format, with one row per session and the successive actions/events stored as separate variables (T1, T2, ...).
meta_data: Other variables from the original data in wide format.
statistics: A list containing summary statistics: total sessions, total actions, maximum sequence length, unique users, time range (if applicable), and the top sessions and users by activity.

Examples

data <- tibble::tibble(
  user = c("A", "A", "A", "B", "B", "C", "C", "C"),
  time = c(
    "2023-01-01 10:00:00", "2023-01-01 10:05:00",
    "2023-01-01 10:20:00", "2023-01-01 12:00:00",
    "2023-01-01 12:02:00", "2023-01-01 14:00:00",
    "2023-01-01 14:05:00", "2023-01-01 14:10:00"
  ),
  action = c(
    "view", "click", "add_cart", "view",
    "checkout", "view", "click", "share"
   )
)
results <- prepare_data(
  data, actor = "user", time = "time", action = "action"
)
#> ── Preparing Data ──────────────────────────────────────────────────────────────
#> ℹ Input data dimensions: 8 rows, 3 columns
#> ℹ First few time values: "2023-01-01 10:00:00", "2023-01-01 10:05:00", and
#>   "2023-01-01 10:20:00"
#> ℹ Number of values to parse: 8
#> ℹ Sample values: "2023-01-01 10:00:00", "2023-01-01 10:05:00", and "2023-01-01
#>   10:20:00"
#> ✔ Successfully parsed using format: "%Y-%m-%d %H:%M:%S"
#> ℹ Sample of parsed times: 2023-01-01 10:00:00, 2023-01-01 10:05:00, and
#>   2023-01-01 10:20:00
#> ℹ Time threshold for new session: 900 seconds
#> ℹ Total number of sessions: 3
#> ℹ Number of unique users: 3
#> ℹ Total number of actions: 8
#> ℹ Maximum sequence length: 3 actions
#> ℹ Time range: 2023-01-01 10:00:00 to 2023-01-01 14:10:00
#> ℹ Sessions per user:
#>   A: 1
#>   B: 1
#>   C: 1
#> ℹ Top 5 longest sessions:
#>   A session1: 3
#>   C session1: 3
#>   B session1: 2
print(results$sequence_data)
#> # A tibble: 3 × 3
#>   T1    T2       T3      
#>   <chr> <chr>    <chr>   
#> 1 view  click    add_cart
#> 2 view  checkout NA      
#> 3 view  click    share   
print(results$meta_data)
#> # A tibble: 3 × 5
#>   .session_id user  time                .standardized_time  .session_nr
#>   <chr>       <chr> <chr>               <dttm>                    <int>
#> 1 A session1  A     2023-01-01 10:00:00 2023-01-01 10:00:00           1
#> 2 B session1  B     2023-01-01 12:00:00 2023-01-01 12:00:00           1
#> 3 C session1  C     2023-01-01 14:00:00 2023-01-01 14:00:00           1
print(results$statistics)
#> $total_sessions
#> [1] 3
#> 
#> $total_actions
#> [1] 8
#> 
#> $max_sequence_length
#> [1] 3
#> 
#> $unique_users
#> [1] 3
#> 
#> $sessions_per_user
#> # A tibble: 3 × 2
#>   user  n_sessions
#>   <chr>      <int>
#> 1 A              1
#> 2 B              1
#> 3 C              1
#> 
#> $actions_per_session
#> # A tibble: 3 × 2
#>   .session_id n_actions
#>   <chr>           <int>
#> 1 A session1          3
#> 2 C session1          3
#> 3 B session1          2
#> 
#> $time_range
#> [1] "2023-01-01 10:00:00 UTC" "2023-01-01 14:10:00 UTC"
#> 

data_ordered <- tibble::tibble(
   user = c("A", "A", "A", "B", "B", "C", "C", "C"),
   order = c(1, 2, 3, 1, 2, 1, 2, 3),
   action = c(
     "view", "click", "add_cart", "view",
     "checkout", "view", "click", "share"
   )
)
results_ordered <- prepare_data(
  data_ordered, actor = "user", order = "order", action = "action"
)
#> ── Preparing Data ──────────────────────────────────────────────────────────────
#> ℹ Input data dimensions: 8 rows, 3 columns
#> ℹ Using provided `order` column to create sequences.
#> ℹ Total number of sessions: 3
#> ℹ Number of unique users: 3
#> ℹ Total number of actions: 8
#> ℹ Maximum sequence length: 3 actions
#> ℹ Sessions per user:
#>   A: 1
#>   B: 1
#>   C: 1
#> ℹ Top 5 longest sessions:
#>   A: 3
#>   C: 3
#>   B: 2
print(results_ordered$sequence_data)
#> # A tibble: 3 × 3
#>   T1    T2       T3      
#>   <chr> <chr>    <chr>   
#> 1 view  click    add_cart
#> 2 view  checkout NA      
#> 3 view  click    share   
print(results_ordered$meta_data)
#> # A tibble: 3 × 3
#>   .session_id user  order
#>   <chr>       <chr> <dbl>
#> 1 A           A         1
#> 2 B           B         1
#> 3 C           C         1
print(results_ordered$statistics)
#> $total_sessions
#> [1] 3
#> 
#> $total_actions
#> [1] 8
#> 
#> $max_sequence_length
#> [1] 3
#> 
#> $unique_users
#> [1] 3
#> 
#> $sessions_per_user
#> # A tibble: 3 × 2
#>   user  n_sessions
#>   <chr>      <int>
#> 1 A              1
#> 2 B              1
#> 3 C              1
#> 
#> $actions_per_session
#> # A tibble: 3 × 2
#>   .session_id n_actions
#>   <chr>           <int>
#> 1 A                   3
#> 2 C                   3
#> 3 B                   2
#> 

data_single_session <- tibble::tibble(
  action = c(
    "view", "click", "add_cart", "view",
    "checkout", "view", "click", "share"
   )
)
results_single <- prepare_data(data_single_session, action = "action")
#> ── Preparing Data ──────────────────────────────────────────────────────────────
#> ℹ Input data dimensions: 8 rows, 1 columns
#> ℹ No `time` or `order` column provided. Treating the entire dataset as one
#>   session.
#> ℹ Total number of sessions: 1
#> ℹ Total number of actions: 8
#> ℹ Maximum sequence length: 8 actions
#> ℹ Top 5 longest sessions:
#>   session: 8
print(results_single$sequence_data)
#> # A tibble: 1 × 8
#>   T1    T2    T3       T4    T5       T6    T7    T8   
#>   <chr> <chr> <chr>    <chr> <chr>    <chr> <chr> <chr>
#> 1 view  click add_cart view  checkout view  click share
print(results_single$meta_data)
#> # A tibble: 1 × 1
#>   .session_id
#>   <chr>      
#> 1 session    
print(results_single$statistics)
#> $total_sessions
#> [1] 1
#> 
#> $total_actions
#> [1] 8
#> 
#> $max_sequence_length
#> [1] 8
#> 
#> $actions_per_session
#> # A tibble: 1 × 2
#>   .session_id n_actions
#>   <chr>           <int>
#> 1 session             8
#>

Usage

Arguments

Value

See also

Examples