A number of basic algorithms underlie almost every line of a program written in a high-level language. It is good to have at hand Donald Knuth's classic multivolume work "The Art of Computer Programming", where many of these basic algorithms are analyzed in detail. But reading and assimilating all of it requires a lot of effort and time, and that effort has to be motivated somehow.
Many may assume that knowing such nuances was necessary 50 years ago, whereas now you can rely on ready-made packages and functions without diving into the details. However, this is not the case. Likewise, it is still important to understand how data is stored in memory and how it is processed by the processor.
. . .
, . , , .
, , :
case_id
— /;record
— ;start
— .
# Packages used throughout: tidyverse (dplyr / tidyr / purrr pipelines),
# data.table (by-reference column updates), rTRNG (runif_trng generator).
# Each call must be its own statement: several calls on one line without
# a separator do not parse in R.
library(tidyverse)
library(data.table)
library(rTRNG)
. . "". . 10^5-10^n
, .
# Number of cases to simulate
nn <- 100
# Names of the records (steps) that every case consists of
records <- c("first", "one and a half", "second", "third", "fourth",
             "fifth", "sixth")
# One row per (case_id, record) pair: every case gets the full record list.
# seq_len() is used instead of 1:nn so that nn == 0 gives an empty table
# rather than the two bogus cases c(1, 0).
df <- tibble(case_id = seq_len(nn), recs = list(records)) %>%
  unnest(recs)
# data.table copy of the same data with case_id stored as double
dt <- as.data.table(df)[, case_id := as.numeric(case_id)]
# Key the table by case_id; this also sorts the rows by case_id, which the
# data.table generators below depend on
setkey(dt, case_id)
head(df, 10)
# A tibble: 10 x 2
   case_id recs
     <int> <chr>
 1       1 first
 2       1 one and a half
 3       1 second
 4       1 third
 5       1 fourth
 6       1 fifth
 7       1 sixth
 8       2 first
 9       2 one and a half
10       2 second
โ . [0; 1]
. unixtimestamp , . . , , .
1.
. , .
#' Baseline: one sorted random time mark per record, generated per case.
#'
#' @param df Data frame / tibble with a `case_id` column.
#' @return `df`, ungrouped, with an added `t_idx` column: uniform draws on
#'   [0, 1] sorted in increasing order within every `case_id`.
f1 <- function(df) {
  by_case <- group_by(df, case_id)
  # n() draws per group, sorted so that time grows along the records of a case
  stamped <- mutate(by_case, t_idx = sort(runif(n(), 0, 1)))
  ungroup(stamped)
}
. , . , 100 .
median `itr/sec` mem_alloc 15.38ms 63.2 284.9KB
, ?
1+1/2. +
rTRNG
. , , . :
#' Same grouped approach as the baseline, but drawing with rTRNG's generator.
#'
#' @param df Data frame / tibble with a `case_id` column.
#' @return `df`, ungrouped, with an added `t_idx` column holding per-case
#'   sorted `runif_trng()` draws on [0, 1].
f1_5 <- function(df) {
  ungroup(
    mutate(
      group_by(df, case_id),
      # only the generator differs from the base-R variant
      t_idx = sort(runif_trng(n(), 0, 1))
    )
  )
}
median `itr/sec` mem_alloc 29.34ms 29.5 284.9KB
. ? . , tidyverse
data.table
, .
2. , data.table
โ , .
#' Generate all time marks in one go and sort them once over the whole table.
#'
#' Relies on the rows of `dt` already being ordered by `case_id`: the values
#' ordered by (case_id, t_idx) are written back positionally.
#'
#' @param dt data.table with a `case_id` column; modified by reference.
#' @return `dt` with the `t_idx` column (re)written.
f2 <- function(dt) {
  # Step 1: one uniform draw per row, no grouping involved
  dt[, t_idx := runif_trng(.N, 0, 1)]
  # Step 2: pull the draws out ordered by case and then by value
  sorted_marks <- dt[order(case_id, t_idx), t_idx]
  # Step 3: write them back row by row
  dt[, t_idx := sorted_marks]
}
, 15-20 .
median `itr/sec` mem_alloc 1.69ms 554. 109KB
? ?
3. ,
, , , by
, . . โ , . . [0; 1]
, . case_id
, โ . case_id
,
#' Per-case sorted time marks obtained with a single global sort.
#'
#' Adding the draw to `case_id` and sorting the sums once orders the draws
#' inside every case; subtracting `case_id` again leaves the sorted draws.
#' Presumably expects `case_id` to be whole numbers in non-decreasing row
#' order, as prepared by the setup code.
#'
#' @param dt data.table with a numeric `case_id` column; modified by reference.
#' @return `dt` with the `t_idx` column (re)written.
f3 <- function(dt) {
  dt[, t_idx := {
    noise <- runif_trng(.N, 0, 1, parallelGrain = 10000L)
    sort(case_id + noise) - case_id
  }]
}
2 , , .
median `itr/sec` mem_alloc 826.7us 1013. 54.3KB
3+1/2. , , set
? , . , NSE . , .
#' Same shift-sort-unshift trick as f3, written with `set()` and base `runif()`.
#'
#' @param dt data.table with a numeric `case_id` column; modified by reference.
#' @return The result of `set()`, i.e. `dt` with the `t_idx` column assigned.
f3_5 <- function(dt) {
  ids <- dt$case_id
  # offset every draw by its case id so that one sort orders all cases at once
  shifted <- ids + runif(nrow(dt), 0, 1)
  set(dt, j = "t_idx", value = sort(shifted) - ids)
}
5 , 4
median `itr/sec` mem_alloc 161.5us 5519. 16.3KB
.
# Benchmark all single-timestamp generators side by side.
# The outputs are random, so the equality check between expressions is off.
bench::mark(
  f1(df),
  f1_5(df),
  f2(dt),
  f3(dt),
  f3_5(dt),
  check = FALSE
)
  expression      min   median `itr/sec` mem_alloc
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>
1 f1(df)       14.3ms  15.38ms      63.2   284.9KB
2 f1_5(df)    24.43ms  29.34ms      29.5   284.9KB
3 f2(dt)       1.55ms   1.69ms     554.      109KB
4 f3(dt)        722us  826.7us    1013.     54.3KB
5 f3_5(dt)    142.5us  161.5us    5519.     16.3KB
90 , 18 . , .
. (" "). .
, . .
1.
โ , , , . , , , , , . . :
# Target layout: one row per record with the columns case_id, record, start, finish
# Within a record the finish time must exceed the start time: finish > start
# For consecutive records 1 and 2 of a case: start_2 > finish_1
dt[, t_idx := NULL] # drop the t_idx column (presumably left over from the single-timestamp benchmarks above)
#' Baseline for the start/finish pair: grouped sort plus a row-wise draw.
#'
#' @param df Data frame / tibble with a `case_id` column.
#' @return `df`, ungrouped, with two added columns: `ts_idx` (start marks,
#'   sorted within each case) and `tf_idx` (finish marks, each drawn
#'   uniformly between the row's start and its upper bound).
f1 <- function(df) {
  starts <- df %>%
    group_by(case_id) %>%
    mutate(ts_idx = sort(runif(n(), 0, 1))) %>%
    ungroup()

  # The upper bound for a finish mark is the start of the following row.
  # The very last row has no successor, so lead() fills in 1. Where the next
  # start is not greater than the current one (e.g. the next row belongs to
  # another case), the bound is replaced with 1 so that runif() never sees
  # max < min, which would give NaN.
  next_start <- lead(starts$ts_idx, default = 1)
  upper <- if_else(next_start > starts$ts_idx, next_start, 1)

  # One draw per row between its own start and its upper bound
  starts$tf_idx <- map2_dbl(
    starts$ts_idx, upper,
    function(lo, hi) runif(1, lo, hi)
  )
  starts
}
, , , .
median `itr/sec` mem_alloc 28.16ms 30.7 2.06MB
2. ,
. 2 , . ! โ , . .
#' Start/finish marks from one sort of twice as many draws.
#'
#' Draws 2 * .N values, shifts them by `case_id`, sorts once and takes
#' consecutive pairs as (start, finish). Presumably expects rows ordered by
#' whole-number `case_id`, as prepared by the setup code.
#'
#' @param dt data.table with a numeric `case_id` column; modified by reference.
#' @return `dt` with the `ts_idx` and `tf_idx` columns (re)written.
f2 <- function(dt) {
  dt[, c("ts_idx", "tf_idx") := {
    # case_id (length .N) is recycled over the 2 * .N draws
    shifted <- sort(case_id + runif(2 * .N, 0, 1))
    # positions 1, 3, 5, ... hold the starts; the element after each is the finish
    odd <- seq_len(.N) * 2L - 1L
    list(shifted[odd] - case_id, shifted[odd + 1L] - case_id)
  }]
}
30 ! .
median `itr/sec` mem_alloc 1.04ms 733. 74.38KB
2+1/2. , , set
#' Start/finish marks via one global sort, assigned with `set()`.
#'
#' @param dt data.table with a numeric `case_id` column; modified by reference.
#' @return The result of the last `set()` call, i.e. `dt` with the `ts_idx`
#'   and `tf_idx` columns assigned.
f2_5 <- function(dt) {
  ids <- dt$case_id
  # twice as many draws as rows; ids are recycled over them before sorting
  marks <- sort(ids + runif(2 * nrow(dt), 0, 1))
  # consecutive sorted pairs: odd positions are starts, even positions finishes
  set(dt, j = "ts_idx", value = marks[c(TRUE, FALSE)] - ids)
  set(dt, j = "tf_idx", value = marks[c(FALSE, TRUE)] - ids)
}
. 4 .
median `itr/sec` mem_alloc 278.1us 2781. 57.55KB
.
# Benchmark the start/finish generators side by side.
# The outputs are random, so the equality check between expressions is off.
bench::mark(
  f1(df),
  f2(dt),
  f2_5(dt),
  check = FALSE
)
  median `itr/sec` mem_alloc
 28.16ms      30.7    2.06MB
  1.04ms     733.    74.38KB
 278.1us    2781.    57.55KB
90 , 35 .
, . , , , . , , . โ , . . , .
, BigData , , - .
P.S.
, . -
โ . .
- .
— « R».