A handful of basic algorithms underlie almost every line of a program written in a high-level language. It is good to have Donald Knuth's classic multivolume work "The Art of Computer Programming" at hand, where many of these basic algorithms are analyzed in detail. But reading and absorbing all of it takes a great deal of effort and time, and that effort has to be motivated somehow.

Many assume that such nuances had to be known 50 years ago, whereas today you can rely on ready-made packages and functions without diving into the details. That is not the case. Likewise, it is still important to understand how data is stored in memory and how it is processed by the CPU.

  • case_id

    — case/group identifier;
  • record

    — record label;
  • start

    — start time of the record.


# Test data: `nn` cases, each carrying the same ordered set of records.
nn <- 100
records <- c("first", "one and a half", "second", "third", "fourth",
             "fifth", "sixth")

# Expand the list column so that there is one row per (case_id, record)
# pair, with the records of each case kept in their original order.
df <- tibble(case_id = seq_len(nn), recs = list(records)) %>%
  tidyr::unnest(cols = recs)

# data.table version of the same data, with case_id converted to double.
dt <- as.data.table(df)[, case_id := as.numeric(case_id)]
# Keying sorts the rows by case_id; the data.table functions below rely on
# this ordering.
setkey(dt, case_id)

head(df, 10)

  # A tibble: 10 x 2
     case_id recs          
       <int> <chr>         
   1       1 first         
   2       1 one and a half
   3       1 second        
   4       1 third         
   5       1 fourth        
   6       1 fifth         
   7       1 sixth         
   8       2 first         
   9       2 one and a half
  10       2 second  

# dplyr baseline: within every case draw one uniform(0, 1) value per row
# and sort the draws, so that `t_idx` ascends inside each `case_id`.
# Returns the ungrouped tibble with the extra `t_idx` column.
f1 <- function(df) {
  df %>%
    group_by(case_id) %>%
    mutate(t_idx = sort(runif(n(), 0, 1))) %>%
    ungroup()
}

  median `itr/sec` mem_alloc
 15.38ms      63.2   284.9KB

# Same as the dplyr baseline, but the draws come from `runif_trng()`
# instead of base `runif()`. Returns the ungrouped tibble with the extra
# `t_idx` column.
f1_5 <- function(df) {
  df %>%
    group_by(case_id) %>%
    mutate(t_idx = sort(runif_trng(n(), 0, 1))) %>%
    ungroup()
}

  median `itr/sec` mem_alloc
 29.34ms      29.5   284.9KB

# data.table version: one random draw for the whole table instead of one
# draw per group. Modifies `dt` by reference.
f2 <- function(dt) {
  # Fill `t_idx` with unsorted draws, then read the column back ordered by
  # (case_id, t_idx): the result ascends inside each `case_id` block.
  # Assumes the rows of `dt` are already sorted by `case_id`, so the
  # reordered vector lines up with the existing rows.
  vec <- dt[, t_idx := runif_trng(.N, 0, 1)][order(case_id, t_idx), t_idx]
  dt[, t_idx := vec]
}

  median `itr/sec` mem_alloc 
  1.69ms     554.      109KB 

# data.table version that needs a single plain sort. Modifies `dt` by
# reference.
f3 <- function(dt) {
  # `case_id` holds whole numbers and every draw lies between 0 and 1, so
  # the sum keeps the case in its integer part and the draw in its
  # fractional part. Sorting the sums therefore leaves the cases in order
  # and sorts the draws inside each case; subtracting `case_id` afterwards
  # brings the values back into the 0..1 range.
  # Assumes the rows of `dt` are already sorted by `case_id`.
  dt[, t_idx := sort(case_id + runif_trng(.N, 0, 1, parallelGrain = 10000L)) - case_id]
}

  median `itr/sec` mem_alloc
 826.7us    1013.     54.3KB

# Same "case_id + draw" sorting trick, but with base `runif()` and
# `set()` instead of `[.data.table`. Modifies `dt` by reference.
# Assumes the rows of `dt` are already sorted by `case_id`.
f3_5 <- function(dt) {
  set(dt, j = "t_idx",
      value = sort(dt$case_id + runif(nrow(dt), 0, 1)) - dt$case_id)
}

  median `itr/sec` mem_alloc
 161.5us    5519.     16.3KB


  # Compare all variants. The results are random, so bench::mark() must not
  # check them for equality.
  bench::mark(
    f1(df),
    f1_5(df),
    f2(dt),
    f3(dt),
    f3_5(dt),
    check = FALSE
  )

  expression      min   median `itr/sec` mem_alloc
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>
1 f1(df)       14.3ms  15.38ms      63.2   284.9KB
2 f1_5(df)    24.43ms  29.34ms      29.5   284.9KB
3 f2(dt)       1.55ms   1.69ms     554.      109KB
4 f3(dt)        722us  826.7us    1013.     54.3KB
5 f3_5(dt)    142.5us  161.5us    5519.     16.3KB

# Create a data set -- case_id, record, start, finish
# Times must satisfy finish > start within every record,
# and for consecutive records 1, 2 we need start_2 > finish_1

dt[, t_idx := NULL] # drop the column added by the previous experiment

# dplyr baseline for start/finish times: `ts_idx` ascends inside every
# case and `tf_idx` is drawn between a record's start and the start of the
# next record. Returns the tibble with both columns added.
f1 <- function(df) {
  df %>%
    group_by(case_id) %>%
    mutate(ts_idx = sort(runif(n(), 0, 1))) %>%
    ungroup() %>%
    # The upper bound for the finish time is the start of the next row.
    # After ungroup() the successor of a case's last row is the first row
    # of the next case, whose start may be smaller than `ts_idx`; runif()
    # would return NaN for such a range (max < min), so the bound is
    # replaced with 1. The very last row has no successor and gets 1
    # through `default`.
    mutate(tf_idx = {lead(ts_idx, default = 1) %>% if_else(. > ts_idx, ., 1)}) %>%
    mutate(tf_idx = map2_dbl(ts_idx, tf_idx, ~runif(1, .x, .y)))
}

  median `itr/sec` mem_alloc 
 28.16ms      30.7    2.06MB 

# data.table version for start/finish times: draw two values per row,
# sort all of them once and pair neighbours up as (start, finish).
# Modifies `dt` by reference. Assumes the rows of `dt` are already sorted
# by `case_id`.
f2 <- function(dt){
  dt[, c("ts_idx", "tf_idx") := {
    # Vector recycling: `case_id` (length .N) is reused across the 2 * .N
    # draws, so every case receives twice as many values as it has rows.
    x <- case_id + runif(2 * .N, 0, 1)
    # After the sort, consecutive values belong to the same case; filling
    # the matrix by row turns each neighbouring pair into one row, and
    # `case_id` is recycled down the columns when it is subtracted.
    m <- matrix(sort(x), ncol = 2, byrow = TRUE) - case_id
    list(m[, 1], m[, 2])
  }]
}

  median `itr/sec` mem_alloc 
  1.04ms     733.    74.38KB 

# Same pairing trick as the `[.data.table` version, written with `set()`.
# Modifies `dt` by reference. Assumes the rows of `dt` are already sorted
# by `case_id`.
f2_5 <- function(dt){
  # Two draws per row; `case_id` is recycled across the doubled vector.
  x <- dt$case_id + runif(2 * nrow(dt), 0, 1)
  # Sorted neighbours become the (start, finish) pair of one row.
  m <- matrix(sort(x), ncol = 2, byrow = TRUE) - dt$case_id
  set(dt, j = "ts_idx", value = m[, 1])
  set(dt, j = "tf_idx", value = m[, 2])
}

  median `itr/sec` mem_alloc 
 278.1us    2781.    57.55KB 


  # Compare the start/finish variants. The results are random, so
  # bench::mark() must not check them for equality.
  bench::mark(
    f1(df),
    f2(dt),
    f2_5(dt),
    check = FALSE
  )

  median `itr/sec` mem_alloc 
 28.16ms      30.7    2.06MB 
  1.04ms     733.    74.38KB 
 278.1us    2781.    57.55KB 

