
Feature summary
feature_summary.Rmd
Create Metaboprep object
library(metaboprep)
# import data
data <- read.csv(system.file("extdata", "dummy_data.csv", package = "metaboprep"), header=T, row.names = 1) |> as.matrix()
samples <- read.csv(system.file("extdata", "dummy_samples.csv", package = "metaboprep"), header=T, row.names = 1)
features <- read.csv(system.file("extdata", "dummy_features.csv", package = "metaboprep"), header=T, row.names = 1)
# create object
m <- Metaboprep(data = data, samples = samples, features = features)
# print
m
#> <metaboprep::Metaboprep>
#> @ data : num [1:100, 1:20, 1] 0.755887 0.662386 0.444527 0.627146 0.000465 ...
#> .. - attr(*, "dimnames")=List of 3
#> .. ..$ : chr [1:100] "id_100" "id_99" "id_98" "id_97" ...
#> .. ..$ : chr [1:20] "metab_id_1" "metab_id_2" "metab_id_3" "metab_id_4" ...
#> .. ..$ : chr "input"
#> @ samples :'data.frame': 100 obs. of 5 variables:
#> .. $ sample_id: chr "id_100" "id_99" "id_98" "id_97" ...
#> .. $ age : int 29 47 65 57 52 40 42 63 49 42 ...
#> .. $ sex : chr "male" "male" "female" "female" ...
#> .. $ pos : chr "batch2" "batch1" "batch2" "batch1" ...
#> .. $ neg : chr "batch2" "batch2" "batch2" "batch1" ...
#> @ features :'data.frame': 20 obs. of 5 variables:
#> .. $ feature_id : chr "metab_id_1" "metab_id_2" "metab_id_3" "metab_id_4" ...
#> .. $ platform : chr "neg" "neg" "neg" "pos" ...
#> .. $ pathway : logi NA NA NA NA NA NA ...
#> .. $ derived_feature : logi TRUE FALSE FALSE FALSE FALSE FALSE ...
#> .. $ xenobiotic_feature: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
#> @ exclusions :List of 2
#> .. $ samples :List of 5
#> .. ..$ user_excluded : chr(0)
#> .. ..$ extreme_sample_missingness : chr(0)
#> .. ..$ user_defined_sample_missingness : chr(0)
#> .. ..$ user_defined_sample_totalpeakarea: chr(0)
#> .. ..$ user_defined_sample_pca_outlier : chr(0)
#> .. $ features:List of 3
#> .. ..$ user_excluded : chr(0)
#> .. ..$ extreme_feature_missingness : chr(0)
#> .. ..$ user_defined_feature_missingness: chr(0)
#> @ feature_summary: num[0 , 0 , 0 ]
#> @ sample_summary : num[0 , 0 , 0 ]
Run feature summary
feature_summ <- feature_summary(metaboprep = m,
source_layer = "input",
outlier_udist = 1.0,
tree_cut_height = 0.5,
output = "data.frame")
feature_id | missingness | outlier_count | n | mean | sd | median | min | max | range | skew | kurtosis | se | missing | var | disp_index | coef_variance | W | log10_W | k | independent_features |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
metab_id_1 | 0 | 5 | 100 | 0.511 | 0.293 | 0.530 | 0.000 | 0.993 | 0.992 | -0.123 | -1.231 | 0.029 | 0 | 0.086 | 0.168 | 0.574 | 0.949 | 0.744 | 1 | TRUE |
metab_id_2 | 0 | 0 | 100 | 0.521 | 0.310 | 0.547 | 0.018 | 0.993 | 0.975 | -0.150 | -1.404 | 0.031 | 0 | 0.096 | 0.184 | 0.594 | 0.924 | 0.834 | 2 | TRUE |
metab_id_3 | 0 | 10 | 100 | 0.488 | 0.283 | 0.504 | 0.001 | 0.995 | 0.994 | -0.036 | -1.109 | 0.028 | 0 | 0.080 | 0.165 | 0.580 | 0.963 | 0.749 | 3 | TRUE |
metab_id_4 | 0 | 5 | 100 | 0.464 | 0.286 | 0.466 | 0.004 | 0.992 | 0.988 | 0.092 | -1.199 | 0.029 | 0 | 0.082 | 0.177 | 0.617 | 0.954 | 0.833 | 4 | TRUE |
metab_id_5 | 0 | 11 | 100 | 0.521 | 0.293 | 0.547 | 0.004 | 0.976 | 0.972 | -0.219 | -1.161 | 0.029 | 0 | 0.086 | 0.164 | 0.561 | 0.945 | 0.782 | 5 | TRUE |
metab_id_6 | 0 | 7 | 100 | 0.490 | 0.259 | 0.473 | 0.007 | 0.993 | 0.986 | 0.007 | -1.006 | 0.026 | 0 | 0.067 | 0.137 | 0.528 | 0.973 | 0.803 | 6 | TRUE |
metab_id_7 | 0 | 7 | 100 | 0.479 | 0.277 | 0.441 | 0.029 | 0.992 | 0.963 | 0.135 | -1.211 | 0.028 | 0 | 0.077 | 0.160 | 0.579 | 0.953 | 0.899 | 7 | TRUE |
metab_id_8 | 0 | 0 | 100 | 0.476 | 0.312 | 0.491 | 0.001 | 0.999 | 0.998 | 0.059 | -1.350 | 0.031 | 0 | 0.097 | 0.205 | 0.656 | 0.936 | 0.796 | 8 | TRUE |
metab_id_9 | 0 | 10 | 100 | 0.468 | 0.260 | 0.489 | 0.005 | 0.975 | 0.971 | 0.000 | -1.090 | 0.026 | 0 | 0.068 | 0.144 | 0.556 | 0.968 | 0.800 | 9 | TRUE |
metab_id_10 | 0 | 0 | 100 | 0.524 | 0.290 | 0.532 | 0.019 | 0.993 | 0.974 | -0.158 | -1.252 | 0.029 | 0 | 0.084 | 0.161 | 0.554 | 0.945 | 0.841 | 10 | TRUE |
Feature summary attributes
In addition to the summary data, the hierarchical cluster dendrogram
is appended to the returned data.frame
as and
attribute
. This can be accessed with the attribute name:
[source_layer]_tree
, in this case we summarised the
input
data, therefore the attribute name is
input_tree
.
suppressPackageStartupMessages(library(dendextend))
library(ggplot2)
# extract tree from attributes
tree <- attr(feature_summ, 'input_tree')
dend <- stats::as.dendrogram(tree)
# color the independent features blue
metab_color <- feature_summ[, c("feature_id", "independent_features")]
metab_color <- metab_color[match(labels(dend), metab_color$feature_id), ]
metab_color$color <- ifelse(metab_color$independent_features==TRUE, "blue", "black")
# format dendrogram for ploting
dend <- dend |>
dendextend::set("labels_cex", 0.5) |>
dendextend::set("labels_col", metab_color$color) |>
dendextend::set("branches_lwd", 0.5) |>
dendextend::set("branches_k_color", value = metab_color$color)
# plot
ggplot(dend, horiz = TRUE) +
geom_hline(yintercept = 0.5, color = "coral2")
Run feature summary on subset
Using the sample_ids
and feature_ids
arguments you can run the summary for a subset of the data. Note: all
rows will be return, however summary data will only be returned for the
specified ids.
feature_summ <- feature_summary(metaboprep = m,
source_layer = "input",
outlier_udist = 1.0,
tree_cut_height = 0.5,
sample_ids = c("id_96", "id_97", "id_98", "id_99", "id_100"),
feature_ids = c("metab_id_1", "metab_id_2", "metab_id_3"),
output = "data.frame")
feature_id | missingness | outlier_count | n | mean | sd | median | min | max | range | skew | kurtosis | se | missing | var | disp_index | coef_variance | W | log10_W | k | independent_features |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
metab_id_1 | 0 | 1 | 5 | 0.498 | 0.300 | 0.627 | 0.000 | 0.756 | 0.755 | -0.736 | -1.357 | 0.134 | 0 | 0.090 | 0.181 | 0.603 | 0.853 | 0.606 | 1 | TRUE |
metab_id_2 | 0 | 2 | 5 | 0.503 | 0.350 | 0.423 | 0.018 | 0.950 | 0.931 | -0.078 | -1.715 | 0.157 | 0 | 0.123 | 0.244 | 0.697 | 0.970 | 0.751 | 2 | TRUE |
metab_id_3 | 0 | 1 | 5 | 0.460 | 0.243 | 0.420 | 0.202 | 0.807 | 0.605 | 0.294 | -1.842 | 0.109 | 0 | 0.059 | 0.128 | 0.528 | 0.958 | 0.980 | 3 | TRUE |
metab_id_4 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
metab_id_5 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
metab_id_6 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
metab_id_7 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
metab_id_8 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
metab_id_9 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
metab_id_10 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Run sample & feature summaries together
summ <- summarise(metaboprep = m,
source_layer = "input",
outlier_udist = 1.0,
tree_cut_height = 0.5,
output = "data.frame")
str(summ)
#> List of 2
#> $ sample_summary :'data.frame': 100 obs. of 24 variables:
#> ..$ sample_id : chr [1:100] "id_100" "id_99" "id_98" "id_97" ...
#> ..$ missingness : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ tpa_total : num [1:100] 38.7 39.5 45.1 37.3 31.5 ...
#> ..$ tpa_complete_features: num [1:100] 38.7 39.5 45.1 37.3 31.5 ...
#> ..$ outlier_count : num [1:100] 0 0 1 2 3 2 0 0 0 2 ...
#> ..$ pc1 : num [1:100] 1.087 0.418 -2.249 -0.619 2.231 ...
#> ..$ pc2 : num [1:100] 0.321 0.573 0.131 0.749 -0.181 ...
#> ..$ pc3 : num [1:100] 0.00338 1.58112 0.04016 0.5436 -0.9382 ...
#> ..$ pc4 : num [1:100] 0.4766 0.1105 -0.6054 1.9563 0.0382 ...
#> ..$ pc5 : num [1:100] 0.837 -1.154 -0.72 -1.659 -0.776 ...
#> ..$ pc6 : num [1:100] 0.256 -0.256 0.763 -0.174 2.453 ...
#> ..$ pc7 : num [1:100] -0.162 -0.103 -0.614 -0.669 1.494 ...
#> ..$ pc8 : num [1:100] -0.2756 0.1153 0.0756 -2.1717 1.0719 ...
#> ..$ pc9 : num [1:100] 0.577 1.961 -0.521 -0.463 0.236 ...
#> ..$ pc10 : num [1:100] -1.466 -1.201 -1.457 1.769 0.349 ...
#> ..$ pc1_3_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ pc2_3_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ pc3_3_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ pc1_4_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ pc2_4_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ pc3_4_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ pc1_5_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ pc2_5_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ pc3_5_sd_outlier : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#> ..- attr(*, "input_varexp")= Named num [1:20] 0.0996 0.0884 0.0795 0.0691 0.0669 ...
#> .. ..- attr(*, "names")= chr [1:20] "PC1" "PC2" "PC3" "PC4" ...
#> ..- attr(*, "input_num_pcs_scree")= num 3
#> ..- attr(*, "input_num_pcs_parallel")= int 14
#> ..- attr(*, "input_outlier_udist")= num 1
#> $ feature_summary:'data.frame': 20 obs. of 21 variables:
#> ..$ feature_id : chr [1:20] "metab_id_1" "metab_id_2" "metab_id_3" "metab_id_4" ...
#> ..$ missingness : num [1:20] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ outlier_count : num [1:20] 5 0 10 5 11 7 7 0 10 0 ...
#> ..$ n : num [1:20] 100 100 100 100 100 100 100 100 100 100 ...
#> ..$ mean : num [1:20] 0.511 0.521 0.488 0.464 0.521 ...
#> ..$ sd : num [1:20] 0.293 0.31 0.283 0.286 0.293 ...
#> ..$ median : num [1:20] 0.53 0.547 0.504 0.466 0.547 ...
#> ..$ min : num [1:20] 0.000465 0.018364 0.001192 0.004107 0.003896 ...
#> ..$ max : num [1:20] 0.993 0.993 0.995 0.992 0.976 ...
#> ..$ range : num [1:20] 0.992 0.975 0.994 0.988 0.972 ...
#> ..$ skew : num [1:20] -0.123 -0.1496 -0.0365 0.0924 -0.2185 ...
#> ..$ kurtosis : num [1:20] -1.23 -1.4 -1.11 -1.2 -1.16 ...
#> ..$ se : num [1:20] 0.0293 0.031 0.0283 0.0286 0.0293 ...
#> ..$ missing : num [1:20] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ var : num [1:20] 0.0859 0.0959 0.0804 0.0819 0.0856 ...
#> ..$ disp_index : num [1:20] 0.168 0.184 0.165 0.177 0.164 ...
#> ..$ coef_variance : num [1:20] 0.574 0.594 0.58 0.617 0.561 ...
#> ..$ W : num [1:20] 0.949 0.924 0.963 0.954 0.945 ...
#> ..$ log10_W : num [1:20] 0.744 0.834 0.749 0.833 0.782 ...
#> ..$ k : int [1:20] 1 2 3 4 5 6 7 8 9 10 ...
#> ..$ independent_features: logi [1:20] TRUE TRUE TRUE TRUE TRUE TRUE ...
#> ..- attr(*, "input_tree")=List of 7
#> .. ..$ merge : int [1:19, 1:2] -11 -2 -3 -13 -10 -8 -4 -12 -9 -1 ...
#> .. ..$ height : num [1:19] 0.672 0.723 0.764 0.771 0.774 ...
#> .. ..$ order : int [1:20] 3 6 4 5 8 15 10 19 7 1 ...
#> .. ..$ labels : chr [1:20] "metab_id_1" "metab_id_2" "metab_id_3" "metab_id_4" ...
#> .. ..$ method : chr "complete"
#> .. ..$ call : language stats::hclust(d = dist_matrix, method = "complete")
#> .. ..$ dist.method: NULL
#> .. ..- attr(*, "class")= chr "hclust"
#> ..- attr(*, "input_outlier_udist")= num 1
#> ..- attr(*, "input_tree_cut_height")= num 0.5