Processes a dataset to create user sessions based on time gaps, ordering columns, or actor groupings. It supports different ways to understand order in user behavior and provides flexibility when widening the data.
Usage
prepare_data(
data,
actor,
time,
action,
order,
time_threshold = 900,
custom_format = NULL,
is_unix_time = FALSE,
unix_time_unit = "seconds",
unused_fn = dplyr::first
)
Arguments
- data
A
data.frame
containing the action/event data.
- actor
A
character
string giving the name of the column that represents a user/actor identifier. If not provided and neither time nor order is specified, the entire dataset is treated as a single session.
- time
A
character
string giving the name of the column representing timestamps of the action events.
- action
A
character
string giving the name of the column holding the information about the action taken.
- order
A
character
string giving the name of a column with sequence numbers or non-unique orderable values that indicate the order of events within an actor group. If no actor is available, the values are used to order all of the data. This ordering is used when widening the data. If both actor and time are specified, then the sequence order should be specified such that it determines the order of events within actor and each session.
- time_threshold
An
integer
specifying the time threshold in seconds for creating new time-based sessions. Defaults to 900 seconds.
- custom_format
A
character
string giving the format used to parse the time column.
- is_unix_time
A
logical
value indicating whether the time column is in Unix time. The default is FALSE.
- unix_time_unit
A
character
string giving the Unix time unit when is_unix_time is TRUE. The default is "seconds". Valid options are "seconds", "milliseconds", or "microseconds".
- unused_fn
How to handle extra columns when pivoting to wide format. See
tidyr::pivot_wider()
. The default is to keep all columns and to use the first value.
Value
A tna_data object, which is a list with the following elements:
long_data: The processed data in long format.
sequence_data: The processed data on the sequences in wide format, with one row per session and the successive actions/events stored in separate columns.
meta_data: Other variables from the original data in wide format.
statistics: A list containing summary statistics: total sessions, total actions, unique users, time range (if applicable), and top sessions and users by activity.
See also
Basic functions
build_model()
,
hist.group_tna()
,
hist.tna()
,
import_data()
,
plot.group_tna()
,
plot.tna()
,
plot_frequencies()
,
plot_frequencies.group_tna()
,
plot_mosaic()
,
plot_mosaic.group_tna()
,
plot_mosaic.tna_data()
,
print.group_tna()
,
print.summary.group_tna()
,
print.summary.tna()
,
print.tna()
,
print.tna_data()
,
simulate.tna()
,
summary.group_tna()
,
summary.tna()
,
tna-package
Examples
results <- prepare_data(
group_regulation_long, actor = "Actor", time = "Time", action = "Action"
)
#> ── Preparing Data ──────────────────────────────────────────────────────────────
#> ℹ Input data dimensions: 27533 rows, 6 columns
#> ℹ First few time values: 2025-01-01 08:27:07.712698, 2025-01-01
#> 08:35:20.712698, and 2025-01-01 08:42:18.712698
#> ℹ Number of values to parse: 27533
#> ℹ Sample values: 2025-01-01 08:27:07.712698, 2025-01-01 08:35:20.712698, and
#> 2025-01-01 08:42:18.712698
#> ℹ Sample of parsed times: 2025-01-01 08:27:07.712698, 2025-01-01
#> 08:35:20.712698, and 2025-01-01 08:42:18.712698
#> ℹ Time threshold for new session: 900 seconds
#> ℹ Total number of sessions: 2000
#> ℹ Number of unique users: 2000
#> ℹ Total number of actions: 27533
#> ℹ Maximum sequence length: 26 actions
#> ℹ Time range: 2025-01-01 08:01:16.009382 to 2025-01-01 13:03:20.238288
print(results$sequence_data)
#> # A tibble: 2,000 × 26
#> T1 T2 T3 T4 T5 T6 T7 T8 T9 T10 T11 T12 T13
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 cohe… cons… disc… synt… adapt cons… plan cons… NA NA NA NA NA
#> 2 emot… cohe… disc… synt… NA NA NA NA NA NA NA NA NA
#> 3 plan cons… plan NA NA NA NA NA NA NA NA NA NA
#> 4 disc… disc… cons… plan cohe… cons… disc… cons… plan plan NA NA NA
#> 5 cohe… cons… plan plan moni… plan cons… disc… cons… plan plan cohe… cons…
#> 6 disc… adapt cohe… cons… disc… emot… cohe… core… disc… disc… adapt NA NA
#> 7 disc… emot… cohe… cons… core… core… plan plan cons… core… cons… disc… disc…
#> 8 cohe… plan cons… plan cons… disc… disc… synt… cons… disc… synt… adapt cons…
#> 9 emot… cohe… emot… plan moni… disc… emot… cons… moni… disc… synt… core… cons…
#> 10 emot… cohe… cons… plan plan plan plan emot… plan NA NA NA NA
#> # ℹ 1,990 more rows
#> # ℹ 13 more variables: T14 <chr>, T15 <chr>, T16 <chr>, T17 <chr>, T18 <chr>,
#> # T19 <chr>, T20 <chr>, T21 <chr>, T22 <chr>, T23 <chr>, T24 <chr>,
#> # T25 <chr>, T26 <chr>
print(results$meta_data)
#> # A tibble: 2,000 × 8
#> .session_id Actor Achiever Group Course Time
#> <chr> <int> <chr> <dbl> <chr> <dttm>
#> 1 1 session1 1 High 1 A 2025-01-01 08:27:07
#> 2 10 session1 10 High 1 A 2025-01-01 08:23:45
#> 3 100 session1 100 High 10 A 2025-01-01 10:11:50
#> 4 1000 session1 1000 High 100 B 2025-01-01 09:12:00
#> 5 1001 session1 1001 Low 101 B 2025-01-01 09:18:40
#> 6 1002 session1 1002 Low 101 B 2025-01-01 09:18:53
#> 7 1003 session1 1003 Low 101 B 2025-01-01 09:18:05
#> 8 1004 session1 1004 Low 101 B 2025-01-01 09:22:26
#> 9 1005 session1 1005 Low 101 B 2025-01-01 09:22:31
#> 10 1006 session1 1006 Low 101 B 2025-01-01 09:15:23
#> # ℹ 1,990 more rows
#> # ℹ 2 more variables: .standardized_time <dttm>, .session_nr <int>
print(results$statistics)
#> $total_sessions
#> [1] 2000
#>
#> $total_actions
#> [1] 27533
#>
#> $max_sequence_length
#> [1] 26
#>
#> $unique_users
#> [1] 2000
#>
#> $sessions_per_user
#> # A tibble: 2,000 × 2
#> Actor n_sessions
#> <int> <int>
#> 1 1 1
#> 2 2 1
#> 3 3 1
#> 4 4 1
#> 5 5 1
#> 6 6 1
#> 7 7 1
#> 8 8 1
#> 9 9 1
#> 10 10 1
#> # ℹ 1,990 more rows
#>
#> $actions_per_session
#> # A tibble: 2,000 × 2
#> .session_id n_actions
#> <chr> <int>
#> 1 1010 session1 26
#> 2 1015 session1 26
#> 3 1030 session1 26
#> 4 1092 session1 26
#> 5 1106 session1 26
#> 6 1107 session1 26
#> 7 1153 session1 26
#> 8 1184 session1 26
#> 9 1209 session1 26
#> 10 1267 session1 26
#> # ℹ 1,990 more rows
#>
#> $time_range
#> [1] "2025-01-01 08:01:16 UTC" "2025-01-01 13:03:20 UTC"
#>
data_ordered <- tibble::tibble(
user = c("A", "A", "A", "B", "B", "C", "C", "C"),
order = c(1, 2, 3, 1, 2, 1, 2, 3),
action = c(
"view", "click", "add_cart", "view",
"checkout", "view", "click", "share"
)
)
results_ordered <- prepare_data(
data_ordered, actor = "user", order = "order", action = "action"
)
#> ── Preparing Data ──────────────────────────────────────────────────────────────
#> ℹ Input data dimensions: 8 rows, 3 columns
#> ℹ Using provided `order` column to create sequences.
#> ℹ Total number of sessions: 3
#> ℹ Number of unique users: 3
#> ℹ Total number of actions: 8
#> ℹ Maximum sequence length: 3 actions
print(results_ordered$sequence_data)
#> # A tibble: 3 × 3
#> T1 T2 T3
#> <chr> <chr> <chr>
#> 1 view click add_cart
#> 2 view checkout NA
#> 3 view click share
print(results_ordered$meta_data)
#> # A tibble: 3 × 3
#> .session_id user order
#> <chr> <chr> <dbl>
#> 1 A A 1
#> 2 B B 1
#> 3 C C 1
print(results_ordered$statistics)
#> $total_sessions
#> [1] 3
#>
#> $total_actions
#> [1] 8
#>
#> $max_sequence_length
#> [1] 3
#>
#> $unique_users
#> [1] 3
#>
#> $sessions_per_user
#> # A tibble: 3 × 2
#> user n_sessions
#> <chr> <int>
#> 1 A 1
#> 2 B 1
#> 3 C 1
#>
#> $actions_per_session
#> # A tibble: 3 × 2
#> .session_id n_actions
#> <chr> <int>
#> 1 A 3
#> 2 C 3
#> 3 B 2
#>
data_single_session <- tibble::tibble(
action = c(
"view", "click", "add_cart", "view",
"checkout", "view", "click", "share"
)
)
results_single <- prepare_data(data_single_session, action = "action")
#> ── Preparing Data ──────────────────────────────────────────────────────────────
#> ℹ Input data dimensions: 8 rows, 1 columns
#> ℹ No `time` or `order` column provided. Treating the entire dataset as one
#> session.
#> ℹ Total number of sessions: 1
#> ℹ Total number of actions: 8
#> ℹ Maximum sequence length: 8 actions
print(results_single$sequence_data)
#> # A tibble: 1 × 8
#> T1 T2 T3 T4 T5 T6 T7 T8
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 view click add_cart view checkout view click share
print(results_single$meta_data)
#> # A tibble: 1 × 1
#> .session_id
#> <chr>
#> 1 session
print(results_single$statistics)
#> $total_sessions
#> [1] 1
#>
#> $total_actions
#> [1] 8
#>
#> $max_sequence_length
#> [1] 8
#>
#> $actions_per_session
#> # A tibble: 1 × 2
#> .session_id n_actions
#> <chr> <int>
#> 1 session 8
#>