I have to replace the missing value to maximum (Value) by ID. How to do in R
ID Value
1 NA
5 15
8 16
6 8
7 65
8 NA
5 25
1 62
6 14
7 NA
9 11
8 12
9 36
1 26
4 13
I have to replace the missing value to maximum (Value) by ID. How to do in R
ID Value
1 NA
5 15
8 16
6 8
7 65
8 NA
5 25
1 62
6 14
7 NA
9 11
8 12
9 36
1 26
4 13
I would first precompute the max values using a call to aggregate(), and also precompute which rows of the data.frame have an NA value. Then you can match the IDs into the aggregation table to extract the corresponding max value.
maxes <- aggregate(Value~ID,df,max,na.rm=T);
nas <- which(is.na(df$Value));
df$Value[nas] <- maxes$Value[match(df$ID[nas],maxes$ID)];
df;
## ID Value
## 1 1 62
## 2 5 15
## 3 8 16
## 4 6 8
## 5 7 65
## 6 8 16
## 7 5 25
## 8 1 62
## 9 6 14
## 10 7 65
## 11 9 11
## 12 8 12
## 13 9 36
## 14 1 26
## 15 4 13
Alternative, using ave():
df$Value <- ave(df$Value,df$ID,FUN=function(x) { x[is.na(x)] <- max(x,na.rm=T); x; });
df;
## ID Value
## 1 1 62
## 2 5 15
## 3 8 16
## 4 6 8
## 5 7 65
## 6 8 16
## 7 5 25
## 8 1 62
## 9 6 14
## 10 7 65
## 11 9 11
## 12 8 12
## 13 9 36
## 14 1 26
## 15 4 13
Data
df <- data.frame(ID=c(1L,5L,8L,6L,7L,8L,5L,1L,6L,7L,9L,8L,9L,1L,4L),Value=c(NA,15L,16L,8L,
65L,NA,25L,62L,14L,NA,11L,12L,36L,26L,13L));
Notes:
bgoldst2(), rafa(), and akrun() to guard against the case of zero non-NAs in a group; otherwise, max(...,na.rm=T) returns -Inf which can mess up subsequent operations. I used the same algorithm for all three guards. thierry() and bgoldst1() did not have to be modified.res shows the two parameters, the mean run-times for all solutions, and the unit chosen by the microbenchmark summarization algorithm.library(microbenchmark);
library(dplyr);
library(data.table);
library(zoo);
thierry <- function(df) df %>% group_by(ID) %>% mutate(Value = ifelse(is.na(Value), max(Value, na.rm = TRUE), Value));
bgoldst1 <- function(df) { maxes <- aggregate(Value~ID,df,max,na.rm=T); nas <- which(is.na(df$Value)); df$Value[nas] <- maxes$Value[match(df$ID[nas],maxes$ID)]; df; };
bgoldst2 <- function(df) { df$Value <- ave(df$Value,df$ID,FUN=function(x) { nas <- is.na(x); if (any(!nas) && any(nas)) x[nas] <- max(x,na.rm=T); x; }); df; };
rafa <- function(dt) dt[ , Value := { nas <- is.na(Value); if (any(!nas) && any(nas)) ifelse( nas, max(Value, na.rm=T), Value) else Value; }, by = ID];
akrun <- function(dt) dt[, Value := { nas <- is.na(Value); if (any(!nas) && any(nas)) na.aggregate(Value, FUN = max) else Value; }, ID];
## small scale (OP's sample input)
df <- data.frame(ID=c(1L,5L,8L,6L,7L,8L,5L,1L,6L,7L,9L,8L,9L,1L,4L),Value=c(NA,15L,16L,8L,65L,NA,25L,62L,14L,NA,11L,12L,36L,26L,13L));
dt <- as.data.table(df);
ex <- as.data.frame(thierry(copy(df)));
identical(ex,bgoldst1(copy(df)));
identical(ex,bgoldst2(copy(df)));
identical(ex,as.data.frame(rafa(copy(dt))));
identical(ex,as.data.frame(akrun(copy(dt))));
microbenchmark(thierry(copy(df)),bgoldst1(copy(df)),bgoldst2(copy(df)),rafa(copy(dt)),akrun(copy(dt)));
## Unit: microseconds
## expr min lq mean median uq max neval
## thierry(copy(df)) 955.804 989.1610 1043.2847 1004.984 1044.542 2852.016 100
## bgoldst1(copy(df)) 953.238 1005.1985 1069.6281 1039.410 1075.760 2968.337 100
## bgoldst2(copy(df)) 160.798 181.9665 196.0281 192.872 207.412 246.329 100
## rafa(copy(dt)) 947.679 1006.6945 1056.9396 1033.637 1055.874 2943.105 100
## akrun(copy(dt)) 1327.862 1384.5255 1496.1259 1415.530 1445.894 3969.899 100
## large scale, 3 group sizes crossed with 4 NA densities
NV <- 1e5L;
NIs <- c(10L,1e3L,3e4L);
probNAs <- c(1e-3,0.05,0.4,0.95);
res <- expand.grid(NI=NIs,probNA=probNAs);
system.time({
for (ri in seq_len(nrow(res))) {
NI <- res$NI[ri];
probNA <- res$probNA[ri];
df <- data.frame(ID=sample(seq_len(NI),NV,T),Value=sample(c(NA,1:99),NV,T,c(probNA,rep((1-probNA)/99,99L))));
dt <- as.data.table(df);
ex <- as.data.frame(thierry(copy(df)));
if (!all(c(
identical(ex,bgoldst1(copy(df))),
identical(ex,bgoldst2(copy(df))),
identical(ex,as.data.frame(rafa(copy(dt)))),
identical(ex,as.data.frame(akrun(copy(dt))))
))) stop('non-identical failure.');
bm <- summary(microbenchmark(thierry(copy(df)),bgoldst1(copy(df)),bgoldst2(copy(df)),rafa(copy(dt)),akrun(copy(dt)),times=5L));
nms <- sub('\\(.*','',as.character(bm$expr));
for (nm in nms) if (!nm%in%names(res)) res[[nm]] <- NA_real_;
if (!'unit'%in%names(res)) res$unit <- NA_character_;
res[ri,nms] <- bm$mean;
res$unit[ri] <- attr(bm,'unit');
}; ## end for
});
## user system elapsed
## 73.18 0.00 73.37
res;
## NI probNA thierry bgoldst1 bgoldst2 rafa akrun unit
## 1 10 0.001 7.850589 138.77128 14.867427 7.071150 8.023874 milliseconds
## 2 1000 0.001 40.318311 177.26223 9.868853 6.389129 18.054122 milliseconds
## 3 30000 0.001 813.204627 619.16166 125.274735 57.301590 74.732023 milliseconds
## 4 10 0.050 9.387743 139.41686 15.032158 8.479837 6.933616 milliseconds
## 5 1000 0.050 43.223697 156.79871 23.377797 20.550586 145.632279 milliseconds
## 6 30000 0.050 822.338773 677.81813 129.268155 114.585475 656.468438 milliseconds
## 7 10 0.400 15.955374 110.20717 9.785802 11.832889 10.511871 milliseconds
## 8 1000 0.400 55.858348 115.93900 14.441228 22.525058 142.740834 milliseconds
## 9 30000 0.400 853.571520 521.19690 147.925864 208.278328 2518.672465 milliseconds
## 10 10 0.950 9.768268 43.98346 5.921021 9.895623 8.571868 milliseconds
## 11 1000 0.950 49.228024 63.72596 13.702929 22.152230 143.606916 milliseconds
## 12 30000 0.950 822.033257 103.91700 113.398739 86.240922 630.982913 milliseconds
library(dplyr)
dataset %>%
group_by(ID) %>%
mutate(
Value = ifelse(
is.na(Value),
max(Value, na.rm = TRUE),
Value
)
)
A simple and fast solution using data.table. Thanks @bgoldst for the tip of including na.rm=T.
library(data.table)
setDT(df)[ , Value := ifelse( is.na(Value), max(Value, na.rm=T), Value), by = ID]
We can use na.aggregate with data.table
library(data.table)
library(zoo)
setDT(df)[, Value := na.aggregate(Value, FUN = max) , by = ID]
df
# ID Value
# 1: 1 62
# 2: 5 15
# 3: 8 16
# 4: 6 8
# 5: 7 65
# 6: 8 16
# 7: 5 25
# 8: 1 62
# 9: 6 14
#10: 7 65
#11: 9 11
#12: 8 12
#13: 9 36
#14: 1 26
#15: 4 13