
Sample summary
sample_summary.Rmd
Create Metaboprep object
library(metaboprep)
# import data
data <- read.csv(system.file("extdata", "dummy_data.csv", package = "metaboprep"), header=T, row.names = 1) |> as.matrix()
samples <- read.csv(system.file("extdata", "dummy_samples.csv", package = "metaboprep"), header=T, row.names = 1)
features <- read.csv(system.file("extdata", "dummy_features.csv", package = "metaboprep"), header=T, row.names = 1)
# create object
m <- Metaboprep(data = data, samples = samples, features = features)
# print
m
#> <metaboprep::Metaboprep>
#> @ data : num [1:100, 1:20, 1] 0.755887 0.662386 0.444527 0.627146 0.000465 ...
#> .. - attr(*, "dimnames")=List of 3
#> .. ..$ : chr [1:100] "id_100" "id_99" "id_98" "id_97" ...
#> .. ..$ : chr [1:20] "metab_id_1" "metab_id_2" "metab_id_3" "metab_id_4" ...
#> .. ..$ : chr "input"
#> @ samples :'data.frame': 100 obs. of 5 variables:
#> .. $ sample_id: chr "id_100" "id_99" "id_98" "id_97" ...
#> .. $ age : int 29 47 65 57 52 40 42 63 49 42 ...
#> .. $ sex : chr "male" "male" "female" "female" ...
#> .. $ pos : chr "batch2" "batch1" "batch2" "batch1" ...
#> .. $ neg : chr "batch2" "batch2" "batch2" "batch1" ...
#> @ features :'data.frame': 20 obs. of 5 variables:
#> .. $ feature_id : chr "metab_id_1" "metab_id_2" "metab_id_3" "metab_id_4" ...
#> .. $ platform : chr "neg" "neg" "neg" "pos" ...
#> .. $ pathway : logi NA NA NA NA NA NA ...
#> .. $ derived_feature : logi TRUE FALSE FALSE FALSE FALSE FALSE ...
#> .. $ xenobiotic_feature: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
#> @ exclusions :List of 2
#> .. $ samples :List of 5
#> .. ..$ user_excluded : chr(0)
#> .. ..$ extreme_sample_missingness : chr(0)
#> .. ..$ user_defined_sample_missingness : chr(0)
#> .. ..$ user_defined_sample_totalpeakarea: chr(0)
#> .. ..$ user_defined_sample_pca_outlier : chr(0)
#> .. $ features:List of 3
#> .. ..$ user_excluded : chr(0)
#> .. ..$ extreme_feature_missingness : chr(0)
#> .. ..$ user_defined_feature_missingness: chr(0)
#> @ feature_summary: num[0 , 0 , 0 ]
#> @ sample_summary : num[0 , 0 , 0 ]
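The print method above shows the object's slots: the data array (with a single "input" layer), the sample and feature annotation data.frames, the exclusion lists, and the two (currently empty) summary arrays. If you need the underlying pieces directly, here is a minimal sketch, assuming standard @ slot access as implied by the print output above:
# inspect the slots shown in the print output (assumes @ slot access)
dim(m@data)        # 100 samples x 20 features x 1 layer ("input")
head(m@samples)    # sample annotation
head(m@features)   # feature annotation
str(m@exclusions)  # per-sample and per-feature exclusion lists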
Run sample summary
sample_summ <- sample_summary(metaboprep = m,
                              source_layer = "input",
                              outlier_udist = 1.0,
                              output = "data.frame")
sample_id | missingness | tpa_total | tpa_complete_features | outlier_count |
---|---|---|---|---|
id_100 | 0 | 38.699 | 38.699 | 0 |
id_99 | 0 | 39.451 | 39.451 | 0 |
id_98 | 0 | 45.135 | 45.135 | 1 |
id_97 | 0 | 37.289 | 37.289 | 2 |
id_96 | 0 | 31.545 | 31.545 | 3 |
id_95 | 0 | 36.751 | 36.751 | 2 |
id_94 | 0 | 37.384 | 37.384 | 0 |
id_93 | 0 | 34.069 | 34.069 | 0 |
id_92 | 0 | 38.942 | 38.942 | 0 |
id_91 | 0 | 42.743 | 42.743 | 2 |
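The returned data.frame can be used directly to make filtering decisions. A minimal base-R sketch follows; the thresholds below (20% missingness, two or more outlying features) are purely illustrative, not package defaults:
# samples exceeding an illustrative 20% missingness threshold
high_missing <- sample_summ$sample_id[sample_summ$missingness > 0.2]
# samples flagged as an outlier for two or more features (illustrative cut-off)
frequent_outliers <- sample_summ$sample_id[sample_summ$outlier_count >= 2]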
Run sample summary on subset
Using the sample_ids and feature_ids arguments you can run the summary for a subset of the data. Note: all rows will be returned; however, summary values are only populated for the specified ids.
sample_summ <- sample_summary(metaboprep = m,
                              source_layer = "input",
                              outlier_udist = 1.0,
                              sample_ids = c("id_96", "id_97", "id_98", "id_99", "id_100"),
                              feature_ids = c("metab_id_1", "metab_id_2", "metab_id_3"),
                              output = "data.frame")
sample_id | missingness | tpa_total | tpa_complete_features | outlier_count |
---|---|---|---|---|
id_100 | 0 | 4.876 | 4.876 | 0 |
id_99 | 0 | 6.632 | 6.632 | 1 |
id_98 | 0 | 5.907 | 5.907 | 0 |
id_97 | 0 | 6.575 | 6.575 | 1 |
id_96 | 0 | 0.870 | 0.870 | 2 |
id_95 | NA | NA | NA | NA |
id_94 | NA | NA | NA | NA |
id_93 | NA | NA | NA | NA |
id_92 | NA | NA | NA | NA |
id_91 | NA | NA | NA | NA |
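Samples outside the requested sample_ids come back as all-NA rows, so they can be dropped if you only want the summarised subset; a minimal sketch:
# keep only the rows that were actually summarised
subset_only <- sample_summ[!is.na(sample_summ$missingness), ]
subset_only$sample_id  # the five requested ids: id_100 through id_96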
Run PCA analysis
pc_and_outliers() performs principal component analysis. Missing data are imputed to the median, and the imputed data are used to identify the number of informative or ‘significant’ PCs by (1) an acceleration analysis and (2) a parallel analysis. Finally, the number of sample outliers is determined at 3, 4, and 5 standard deviations from the mean on the top PCs, as determined by the acceleration factor analysis.
pc_analysis <- pc_and_outliers(metaboprep = m,
                               source_layer = "input")
sample_id | pc1 | pc2 | pc3 | pc4 | pc5 | pc6 | pc7 | pc8 | pc9 | pc10 | pc1_3_sd_outlier | pc2_3_sd_outlier | pc3_3_sd_outlier | pc1_4_sd_outlier | pc2_4_sd_outlier | pc3_4_sd_outlier | pc1_5_sd_outlier | pc2_5_sd_outlier | pc3_5_sd_outlier |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id_100 | 1.087 | 0.321 | 0.003 | 0.477 | 0.837 | 0.256 | -0.162 | -0.276 | 0.577 | -1.466 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
id_99 | 0.418 | 0.573 | 1.581 | 0.110 | -1.154 | -0.256 | -0.103 | 0.115 | 1.961 | -1.201 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
id_98 | -2.249 | 0.131 | 0.040 | -0.605 | -0.720 | 0.763 | -0.614 | 0.076 | -0.521 | -1.457 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
id_97 | -0.619 | 0.749 | 0.544 | 1.956 | -1.659 | -0.174 | -0.669 | -2.172 | -0.463 | 1.769 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
id_96 | 2.231 | -0.181 | -0.938 | 0.038 | -0.776 | 2.453 | 1.494 | 1.072 | 0.236 | 0.349 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
id_95 | -2.266 | -0.147 | -0.042 | -0.775 | -0.625 | -0.426 | -0.755 | 1.979 | 0.624 | -0.591 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
id_94 | 2.495 | 0.351 | 0.568 | 1.273 | 1.215 | 0.748 | 0.420 | 0.591 | -0.047 | 1.117 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
id_93 | 0.941 | -0.274 | 1.458 | -1.740 | 0.901 | -0.859 | -0.361 | 0.689 | -2.045 | 1.248 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
id_92 | -1.384 | -0.458 | 1.154 | 1.678 | -0.977 | -1.009 | -0.017 | 0.656 | 1.952 | 0.756 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
id_91 | -1.717 | 2.883 | 0.230 | 0.741 | -0.961 | 1.664 | 1.846 | -0.801 | -0.530 | 0.544 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
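The PC scores and per-PC outlier flags in the returned data.frame can be plotted directly. A minimal ggplot2 sketch highlighting samples that lie beyond 3 standard deviations on PC1 or PC2; the colouring rule is illustrative, not a package recommendation:
library(ggplot2)
# flag samples that are a 3 SD outlier on PC1 or PC2 (illustrative rule)
is_outlier <- pc_analysis$pc1_3_sd_outlier + pc_analysis$pc2_3_sd_outlier > 0
ggplot(pc_analysis, aes(x = pc1, y = pc2)) +
  geom_point(aes(color = is_outlier), size = 2) +
  scale_color_manual(values = c("FALSE" = "#377EB8", "TRUE" = "#E41A1C"),
                     name = "3 SD outlier (PC1 or PC2)") +
  labs(x = "PC1", y = "PC2") +
  theme_classic()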
Additional attributes
In addition, the variance explained vector is appended to the returned data.frame as an attribute. It can be accessed with the attribute name [source_layer]_varexp; in this case we used the input data, so the attribute name is input_varexp. In a similar way, the results of the acceleration analysis (input_num_pcs_scree) and the parallel analysis (input_num_pcs_parallel) can also be extracted.
library(ggplot2)
# extract the variance explained vector from the attributes
varexp <- attr(pc_analysis, 'input_varexp')
# subset to the top 100 PCs for nicer plotting
if (length(varexp) > 100) varexp <- varexp[1:100]
# get the acceleration and parallel analysis results
af <- attr(pc_analysis, 'input_num_pcs_scree')
np <- attr(pc_analysis, 'input_num_pcs_parallel')
if (af == np) np <- np + 0.1 # offset slightly so both lines are visible if equal
# as data.frame
x_labs <- sub("pc", "", names(varexp), ignore.case = TRUE)
ve <- data.frame("pc" = factor(x_labs, levels = x_labs),
                 "var_exp" = varexp)
lines <- data.frame("Analysis" = c("Acceleration", "Parallel"),
                    "pc" = c(af, np))
# plot
ggplot(ve, aes(x = pc, y = var_exp)) +
  geom_line(aes(group = 1), color = "grey") +
  geom_point(shape = 21, fill = "#377EB8", size = 2) +
  geom_vline(data = lines, aes(xintercept = pc, color = Analysis), inherit.aes = FALSE) +
  scale_color_manual(values = c("Acceleration" = "#E41A1C", "Parallel" = "#4DAF4A")) +
  scale_x_discrete(labels = function(x) ifelse(seq_along(x) %% 10 == 0 | x == 1, x, "")) +
  labs(x = "PC", y = "Variance explained") +
  theme_classic() +
  theme(legend.position = "top")
Run sample & feature summaries together
summ <- summarise(metaboprep = m,
                  source_layer = "input",
                  outlier_udist = 1.0,
                  tree_cut_height = 0.5,
                  output = "data.frame")
str(summ)
#> List of 2
#> $ sample_summary :'data.frame': 100 obs. of 24 variables:
#> ..$ sample_id : chr [1:100] "id_100" "id_99" "id_98" "id_97" ...
#> ..$ missingness : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ tpa_total : num [1:100] 38.7 39.5 45.1 37.3 31.5 ...
#> ..$ tpa_complete_features: num [1:100] 38.7 39.5 45.1 37.3 31.5 ...
#> ..$ outlier_count : num [1:100] 0 0 1 2 3 2 0 0 0 2 ...
#> ..$ pc1 : num [1:100] 1.087 0.418 -2.249 -0.619 2.231 ...
#> ..$ pc2 : num [1:100] 0.321 0.573 0.131 0.749 -0.181 ...
#> ..$ pc3 : num [1:100] 0.00338 1.58112 0.04016 0.5436 -0.9382 ...
#> ..$ pc4 : num [1:100] 0.4766 0.1105 -0.6054 1.9563 0.0382 ...
#> ..$ pc5 : num [1:100] 0.837 -1.154 -0.72 -1.659 -0.776 ...
#> ..$ pc6 : num [1:100] 0.256 -0.256 0.763 -0.174 2.453 ...
#> ..$ pc7 : num [1:100] -0.162 -0.103 -0.614 -0.669 1.494 ...
#> ..$ pc8 : num [1:100] -0.2756 0.1153 0.0756 -2.1717 1.0719 ...
#> ..$ pc9 : num [1:100] 0.577 1.961 -0.521 -0.463 0.236 ...
#> ..$ pc10 : num [1:100] -1.466 -1.201 -1.457 1.769 0.349 ...
#> ..$ pc1_3_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ pc2_3_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ pc3_3_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ pc1_4_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ pc2_4_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ pc3_4_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ pc1_5_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ pc2_5_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ pc3_5_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..- attr(*, "input_varexp")= Named num [1:20] 0.0996 0.0884 0.0795 0.0691 0.0669 ...
#> .. ..- attr(*, "names")= chr [1:20] "PC1" "PC2" "PC3" "PC4" ...
#> ..- attr(*, "input_num_pcs_scree")= num 3
#> ..- attr(*, "input_num_pcs_parallel")= int 14
#> ..- attr(*, "input_outlier_udist")= num 1
#> $ feature_summary:'data.frame': 20 obs. of 21 variables:
#> ..$ feature_id : chr [1:20] "metab_id_1" "metab_id_2" "metab_id_3" "metab_id_4" ...
#> ..$ missingness : num [1:20] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ outlier_count : num [1:20] 5 0 10 5 11 7 7 0 10 0 ...
#> ..$ n : num [1:20] 100 100 100 100 100 100 100 100 100 100 ...
#> ..$ mean : num [1:20] 0.511 0.521 0.488 0.464 0.521 ...
#> ..$ sd : num [1:20] 0.293 0.31 0.283 0.286 0.293 ...
#> ..$ median : num [1:20] 0.53 0.547 0.504 0.466 0.547 ...
#> ..$ min : num [1:20] 0.000465 0.018364 0.001192 0.004107 0.003896 ...
#> ..$ max : num [1:20] 0.993 0.993 0.995 0.992 0.976 ...
#> ..$ range : num [1:20] 0.992 0.975 0.994 0.988 0.972 ...
#> ..$ skew : num [1:20] -0.123 -0.1496 -0.0365 0.0924 -0.2185 ...
#> ..$ kurtosis : num [1:20] -1.23 -1.4 -1.11 -1.2 -1.16 ...
#> ..$ se : num [1:20] 0.0293 0.031 0.0283 0.0286 0.0293 ...
#> ..$ missing : num [1:20] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ var : num [1:20] 0.0859 0.0959 0.0804 0.0819 0.0856 ...
#> ..$ disp_index : num [1:20] 0.168 0.184 0.165 0.177 0.164 ...
#> ..$ coef_variance : num [1:20] 0.574 0.594 0.58 0.617 0.561 ...
#> ..$ W : num [1:20] 0.949 0.924 0.963 0.954 0.945 ...
#> ..$ log10_W : num [1:20] 0.744 0.834 0.749 0.833 0.782 ...
#> ..$ k : int [1:20] 1 2 3 4 5 6 7 8 9 10 ...
#> ..$ independent_features: logi [1:20] TRUE TRUE TRUE TRUE TRUE TRUE ...
#> ..- attr(*, "input_tree")=List of 7
#> .. ..$ merge : int [1:19, 1:2] -11 -2 -3 -13 -10 -8 -4 -12 -9 -1 ...
#> .. ..$ height : num [1:19] 0.672 0.723 0.764 0.771 0.774 ...
#> .. ..$ order : int [1:20] 3 6 4 5 8 15 10 19 7 1 ...
#> .. ..$ labels : chr [1:20] "metab_id_1" "metab_id_2" "metab_id_3" "metab_id_4" ...
#> .. ..$ method : chr "complete"
#> .. ..$ call : language stats::hclust(d = dist_matrix, method = "complete")
#> .. ..$ dist.method: NULL
#> .. ..- attr(*, "class")= chr "hclust"
#> ..- attr(*, "input_outlier_udist")= num 1
#> ..- attr(*, "input_tree_cut_height")= num 0.5
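Both summaries are returned as named elements of a list, with additional attributes attached to each data.frame (for example input_tree on the feature summary). A minimal sketch pulling out the features flagged as independent by the tree-cut step and plotting the stored clustering tree:
# extract the two summaries from the returned list
sample_tab  <- summ$sample_summary
feature_tab <- summ$feature_summary
# features flagged as independent at the chosen tree_cut_height (0.5)
independent <- feature_tab$feature_id[feature_tab$independent_features]
# the hclust tree used for the cut is stored as an attribute and can be plotted
tree <- attr(feature_tab, "input_tree")
plot(tree)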