Sample summary • metaboprep

Create Metaboprep object

library(metaboprep)

# import data
data     <- read.csv(system.file("extdata", "dummy_data.csv",     package = "metaboprep"), header=T, row.names = 1) |> as.matrix()
samples  <- read.csv(system.file("extdata", "dummy_samples.csv",  package = "metaboprep"), header=T, row.names = 1)
features <- read.csv(system.file("extdata", "dummy_features.csv", package = "metaboprep"), header=T, row.names = 1)

# create object
m <- Metaboprep(data = data, samples = samples, features = features)

# print
m
#> <metaboprep::Metaboprep>
#>  @ data           : num [1:100, 1:20, 1] 0.755887 0.662386 0.444527 0.627146 0.000465 ...
#>  .. - attr(*, "dimnames")=List of 3
#>  ..  ..$ : chr [1:100] "id_100" "id_99" "id_98" "id_97" ...
#>  ..  ..$ : chr [1:20] "metab_id_1" "metab_id_2" "metab_id_3" "metab_id_4" ...
#>  ..  ..$ : chr "input"
#>  @ samples        :'data.frame': 100 obs. of  5 variables:
#>  .. $ sample_id: chr  "id_100" "id_99" "id_98" "id_97" ...
#>  .. $ age      : int  29 47 65 57 52 40 42 63 49 42 ...
#>  .. $ sex      : chr  "male" "male" "female" "female" ...
#>  .. $ pos      : chr  "batch2" "batch1" "batch2" "batch1" ...
#>  .. $ neg      : chr  "batch2" "batch2" "batch2" "batch1" ...
#>  @ features       :'data.frame': 20 obs. of  5 variables:
#>  .. $ feature_id        : chr  "metab_id_1" "metab_id_2" "metab_id_3" "metab_id_4" ...
#>  .. $ platform          : chr  "neg" "neg" "neg" "pos" ...
#>  .. $ pathway           : logi  NA NA NA NA NA NA ...
#>  .. $ derived_feature   : logi  TRUE FALSE FALSE FALSE FALSE FALSE ...
#>  .. $ xenobiotic_feature: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
#>  @ exclusions     :List of 2
#>  .. $ samples :List of 5
#>  ..  ..$ user_excluded                    : chr(0) 
#>  ..  ..$ extreme_sample_missingness       : chr(0) 
#>  ..  ..$ user_defined_sample_missingness  : chr(0) 
#>  ..  ..$ user_defined_sample_totalpeakarea: chr(0) 
#>  ..  ..$ user_defined_sample_pca_outlier  : chr(0) 
#>  .. $ features:List of 3
#>  ..  ..$ user_excluded                   : chr(0) 
#>  ..  ..$ extreme_feature_missingness     : chr(0) 
#>  ..  ..$ user_defined_feature_missingness: chr(0) 
#>  @ feature_summary: num[0 , 0 , 0 ] 
#>  @ sample_summary : num[0 , 0 , 0 ]

Run sample summary

sample_summ <- sample_summary(metaboprep    = m, 
                              source_layer  = "input", 
                              outlier_udist = 1.0, 
                              output        = "data.frame")

sample_id	tpa_total	tpa_complete_features	outlier_count
id_100	38.699	38.699	0
id_99	39.451	39.451	0
id_98	45.135	45.135	1
id_97	37.289	37.289	2
id_96	31.545	31.545	3
id_95	36.751	36.751	2
id_94	37.384	37.384	0
id_93	34.069	34.069	0
id_92	38.942	38.942	0
id_91	42.743	42.743	2

Run sample summary on subset

Using the sample_ids and feature_ids arguments you can run the summary for a subset of the data. Note: all rows will be return, however summary data will only be returned for the specified ids.

sample_summ <- sample_summary(metaboprep    = m, 
                              source_layer  = "input", 
                              outlier_udist = 1.0,
                              sample_ids    = c("id_96", "id_97", "id_98", "id_99", "id_100"),
                              feature_ids   = c("metab_id_1", "metab_id_2", "metab_id_3"),
                              output        = "data.frame")

sample_id	missingness	tpa_total	tpa_complete_features	outlier_count
id_100	0	4.876	4.876	0
id_99	0	6.632	6.632	1
id_98	0	5.907	5.907	0
id_97	0	6.575	6.575	1
id_96	0	0.870	0.870	2
id_95	NA	NA	NA	NA
id_94	NA	NA	NA	NA
id_93	NA	NA	NA	NA
id_92	NA	NA	NA	NA
id_91	NA	NA	NA	NA

Run PCA analysis

pc_and_outliers() performs principal component analysis. Missing data is imputed to the median and used to identify the number of informative or ‘significant’ PCs by (1) an acceleration analysis, and (2) a parallel analysis. Finally the number of sample outliers are determined at 3, 4, and 5 standard deviations from the mean on the top PCs as determined by the acceleration factor analysis.

pc_analysis <- pc_and_outliers(metaboprep      = m, 
                               source_layer    = "input")

sample_id	pc1	pc2	pc3	pc4	pc5	pc6	pc7	pc8	pc9	pc10
id_100	1.087	0.321	0.003	0.477	0.837	0.256	-0.162	-0.276	0.577	-1.466
id_99	0.418	0.573	1.581	0.110	-1.154	-0.256	-0.103	0.115	1.961	-1.201
id_98	-2.249	0.131	0.040	-0.605	-0.720	0.763	-0.614	0.076	-0.521	-1.457
id_97	-0.619	0.749	0.544	1.956	-1.659	-0.174	-0.669	-2.172	-0.463	1.769
id_96	2.231	-0.181	-0.938	0.038	-0.776	2.453	1.494	1.072	0.236	0.349
id_95	-2.266	-0.147	-0.042	-0.775	-0.625	-0.426	-0.755	1.979	0.624	-0.591
id_94	2.495	0.351	0.568	1.273	1.215	0.748	0.420	0.591	-0.047	1.117
id_93	0.941	-0.274	1.458	-1.740	0.901	-0.859	-0.361	0.689	-2.045	1.248
id_92	-1.384	-0.458	1.154	1.678	-0.977	-1.009	-0.017	0.656	1.952	0.756
id_91	-1.717	2.883	0.230	0.741	-0.961	1.664	1.846	-0.801	-0.530	0.544

Additional attributes

In addition, the variance explained vector is appended to the returned data.frame as and attribute. This can be accessed with the attribute name: [source_layer]_varexp, in this case we used the input data, therefore the attribute name is input_varexp. In a similar way, the results of the acceleration analysis (input_num_pcs_scree) and a parallel analysis (input_num_pcs_parallel) can also be extracted.

library(ggplot2)

# extract varexp from attributes
varexp <- attr(pc_analysis, 'input_varexp')

# subset to top 100 for nicer plotting
if (length(varexp) > 100) varexp <- varexp[1:100]

# get acceleration and parallel analysis results
af <- attr(pc_analysis, 'input_num_pcs_scree')
np <- attr(pc_analysis, 'input_num_pcs_parallel')
if (af==np) np <- np+0.1 # make line visible if equal

# as data.frame
x_labs <- sub("(?i)pc","", names(varexp))
ve     <- data.frame("pc"      = factor(x_labs, levels=x_labs),
                     "var_exp" = varexp)
lines  <- data.frame("Analysis" = c("Acceleration", "Parallel"), 
                     "pc"       = c(af, np))   

# plot
ggplot(ve, aes(x = pc, y = var_exp)) +
  geom_line(color = "grey") +
  geom_point(shape = 21, fill = "#377EB8", size = 2) +
  geom_vline(data = lines, aes(xintercept = pc, color = Analysis), inherit.aes = FALSE) +
  scale_color_manual(values = c("Acceleration"="#E41A1C", "Parallel"="#4DAF4A")) +
  scale_x_discrete(labels = function(x) ifelse(seq_along(x) %% 10 == 0 | x==1, x, "")) +
  labs(x = "PC", y = "Variance explained") +
  theme_classic() +
  theme(legend.position = "top")

Run sample & feature summaries together

summ <- summarise(metaboprep      = m, 
                  source_layer    = "input", 
                  outlier_udist   = 1.0,
                  tree_cut_height = 0.5,
                  output          = "data.frame")

str(summ)
#> List of 2
#>  $ sample_summary :'data.frame': 100 obs. of  24 variables:
#>   ..$ sample_id            : chr [1:100] "id_100" "id_99" "id_98" "id_97" ...
#>   ..$ missingness          : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ tpa_total            : num [1:100] 38.7 39.5 45.1 37.3 31.5 ...
#>   ..$ tpa_complete_features: num [1:100] 38.7 39.5 45.1 37.3 31.5 ...
#>   ..$ outlier_count        : num [1:100] 0 0 1 2 3 2 0 0 0 2 ...
#>   ..$ pc1                  : num [1:100] 1.087 0.418 -2.249 -0.619 2.231 ...
#>   ..$ pc2                  : num [1:100] 0.321 0.573 0.131 0.749 -0.181 ...
#>   ..$ pc3                  : num [1:100] 0.00338 1.58112 0.04016 0.5436 -0.9382 ...
#>   ..$ pc4                  : num [1:100] 0.4766 0.1105 -0.6054 1.9563 0.0382 ...
#>   ..$ pc5                  : num [1:100] 0.837 -1.154 -0.72 -1.659 -0.776 ...
#>   ..$ pc6                  : num [1:100] 0.256 -0.256 0.763 -0.174 2.453 ...
#>   ..$ pc7                  : num [1:100] -0.162 -0.103 -0.614 -0.669 1.494 ...
#>   ..$ pc8                  : num [1:100] -0.2756 0.1153 0.0756 -2.1717 1.0719 ...
#>   ..$ pc9                  : num [1:100] 0.577 1.961 -0.521 -0.463 0.236 ...
#>   ..$ pc10                 : num [1:100] -1.466 -1.201 -1.457 1.769 0.349 ...
#>   ..$ pc1_3_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ pc2_3_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ pc3_3_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ pc1_4_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ pc2_4_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ pc3_4_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ pc1_5_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ pc2_5_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ pc3_5_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..- attr(*, "input_varexp")= Named num [1:20] 0.0996 0.0884 0.0795 0.0691 0.0669 ...
#>   .. ..- attr(*, "names")= chr [1:20] "PC1" "PC2" "PC3" "PC4" ...
#>   ..- attr(*, "input_num_pcs_scree")= num 3
#>   ..- attr(*, "input_num_pcs_parallel")= int 14
#>   ..- attr(*, "input_outlier_udist")= num 1
#>  $ feature_summary:'data.frame': 20 obs. of  21 variables:
#>   ..$ feature_id          : chr [1:20] "metab_id_1" "metab_id_2" "metab_id_3" "metab_id_4" ...
#>   ..$ missingness         : num [1:20] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ outlier_count       : num [1:20] 5 0 10 5 11 7 7 0 10 0 ...
#>   ..$ n                   : num [1:20] 100 100 100 100 100 100 100 100 100 100 ...
#>   ..$ mean                : num [1:20] 0.511 0.521 0.488 0.464 0.521 ...
#>   ..$ sd                  : num [1:20] 0.293 0.31 0.283 0.286 0.293 ...
#>   ..$ median              : num [1:20] 0.53 0.547 0.504 0.466 0.547 ...
#>   ..$ min                 : num [1:20] 0.000465 0.018364 0.001192 0.004107 0.003896 ...
#>   ..$ max                 : num [1:20] 0.993 0.993 0.995 0.992 0.976 ...
#>   ..$ range               : num [1:20] 0.992 0.975 0.994 0.988 0.972 ...
#>   ..$ skew                : num [1:20] -0.123 -0.1496 -0.0365 0.0924 -0.2185 ...
#>   ..$ kurtosis            : num [1:20] -1.23 -1.4 -1.11 -1.2 -1.16 ...
#>   ..$ se                  : num [1:20] 0.0293 0.031 0.0283 0.0286 0.0293 ...
#>   ..$ missing             : num [1:20] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ var                 : num [1:20] 0.0859 0.0959 0.0804 0.0819 0.0856 ...
#>   ..$ disp_index          : num [1:20] 0.168 0.184 0.165 0.177 0.164 ...
#>   ..$ coef_variance       : num [1:20] 0.574 0.594 0.58 0.617 0.561 ...
#>   ..$ W                   : num [1:20] 0.949 0.924 0.963 0.954 0.945 ...
#>   ..$ log10_W             : num [1:20] 0.744 0.834 0.749 0.833 0.782 ...
#>   ..$ k                   : int [1:20] 1 2 3 4 5 6 7 8 9 10 ...
#>   ..$ independent_features: logi [1:20] TRUE TRUE TRUE TRUE TRUE TRUE ...
#>   ..- attr(*, "input_tree")=List of 7
#>   .. ..$ merge      : int [1:19, 1:2] -11 -2 -3 -13 -10 -8 -4 -12 -9 -1 ...
#>   .. ..$ height     : num [1:19] 0.672 0.723 0.764 0.771 0.774 ...
#>   .. ..$ order      : int [1:20] 3 6 4 5 8 15 10 19 7 1 ...
#>   .. ..$ labels     : chr [1:20] "metab_id_1" "metab_id_2" "metab_id_3" "metab_id_4" ...
#>   .. ..$ method     : chr "complete"
#>   .. ..$ call       : language stats::hclust(d = dist_matrix, method = "complete")
#>   .. ..$ dist.method: NULL
#>   .. ..- attr(*, "class")= chr "hclust"
#>   ..- attr(*, "input_outlier_udist")= num 1
#>   ..- attr(*, "input_tree_cut_height")= num 0.5