Impute missing values (MVs) in the data.
impute_mv(
object,
sample_id,
method = c("knn", "rf", "mean", "median", "zero", "minimum", "bpca", "svdImpute",
"ppca"),
k = 10,
rowmax = 0.5,
colmax = 0.8,
maxp = 1500,
rng.seed = 362436069,
maxiter = 10,
ntree = 100,
decreasing = FALSE,
nPcs = 2,
maxSteps = 100,
threshold = 1e-04,
...
)
A mass_dataset object.
The samples whose missing values should be imputed, given as either an index vector or a character vector of sample IDs (sample_id).
Imputation method. One of "knn", "rf" (missForest), "mean", "median", "zero", "minimum", "bpca" (BPCA), "svdImpute" (SVD) and "ppca" (PPCA). Default is "knn". Details of each method can be found in the Details section and in the referenced papers.
See ?impute.knn
See ?impute.knn
See ?impute.knn
See ?impute.knn
See ?impute.knn
See ?missForest
See ?missForest
See ?missForest
See ?bpca
See ?bpca
See ?bpca
Other arguments.
A new mass_dataset object.
library(massdataset)
data("expression_data")
data("sample_info")
data("variable_info")
object =
create_mass_dataset(
expression_data = expression_data,
sample_info = sample_info,
variable_info = variable_info
)
object
#> --------------------
#> massdataset version: 1.0.12
#> --------------------
#> 1.expression_data:[ 1000 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 1000 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information (extract_process_info())
#> 1 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2022-08-07 20:07:30
get_mv_number(object)
#> [1] 3829
massdataset::get_mv_number(object, by = "sample")
#> Blank_3 Blank_4 QC_1 QC_2 PS4P1 PS4P2 PS4P3 PS4P4
#> 682 702 397 381 424 427 405 411
###remove variables that have MVs in 20% or more of QC samples, or in 50% or more of Subject samples
qc_id =
object %>%
activate_mass_dataset(what = "sample_info") %>%
filter(class == "QC") %>%
pull(sample_id)
subject_id =
object %>%
activate_mass_dataset(what = "sample_info") %>%
filter(class == "Subject") %>%
pull(sample_id)
object =
object %>%
mutate_variable_na_freq(according_to_samples = qc_id) %>%
mutate_variable_na_freq(according_to_samples = subject_id) %>%
activate_mass_dataset(what = "variable_info") %>%
filter(na_freq < 0.2 & na_freq.1 < 0.5)
###remove samples with MV > 50% except Blank samples
object =
filter_samples(
object = object,
flist = function(x) {
sum(is.na(x)) / nrow(object) < 0.5
},
apply_to = c(qc_id, subject_id),
prune = TRUE
)
blank_id =
object %>%
activate_mass_dataset(what = "sample_info") %>%
filter(class == "Blank") %>%
pull(sample_id)
object1 =
impute_mv(object = object,
sample_id = blank_id,
method = "zero")
object1 %>%
activate_mass_dataset(what = "expression_data") %>%
select(dplyr::contains("Blank")) %>%
extract_expression_data() %>%
head()
#> Blank_3 Blank_4
#> M136T55_2_POS 0 0
#> M79T35_POS 0 0
#> M307T548_POS 0 0
#> M349T47_POS 0 0
#> M299T359_POS 0 0
#> M344T471_POS 0 0
object2 =
impute_mv(object = object,
sample_id = subject_id,
method = "knn")
object2 %>%
activate_mass_dataset(what = "sample_info") %>%
filter(class == "Subject") %>%
extract_expression_data() %>%
head()
#> PS4P1 PS4P2 PS4P3 PS4P4
#> M136T55_2_POS 1494436.1 3496912.1 1959178.8 1005418.8
#> M79T35_POS 2471336.1 3333582.7 2734243.8 3361452.3
#> M307T548_POS 288590.2 137297.5 231279.2 271318.3
#> M349T47_POS 5141073.2 8424315.6 7896633.3 6441449.0
#> M299T359_POS 1401632.7 4055989.5 1577496.3 3668817.5
#> M344T471_POS 334718.8 276913.4 304611.5 450460.3