Skip to contents

Create Metaboprep object

library(metaboprep)

# import data
data     <- read.csv(system.file("extdata", "dummy_data.csv",     package = "metaboprep"), header=T, row.names = 1) |> as.matrix()
samples  <- read.csv(system.file("extdata", "dummy_samples.csv",  package = "metaboprep"), header=T, row.names = 1)
features <- read.csv(system.file("extdata", "dummy_features.csv", package = "metaboprep"), header=T, row.names = 1)

# create object
m <- Metaboprep(data = data, samples = samples, features = features)

# print
m
#> <metaboprep::Metaboprep>
#>  @ data           : num [1:100, 1:20, 1] 0.755887 0.662386 0.444527 0.627146 0.000465 ...
#>  .. - attr(*, "dimnames")=List of 3
#>  ..  ..$ : chr [1:100] "id_100" "id_99" "id_98" "id_97" ...
#>  ..  ..$ : chr [1:20] "metab_id_1" "metab_id_2" "metab_id_3" "metab_id_4" ...
#>  ..  ..$ : chr "input"
#>  @ samples        :'data.frame': 100 obs. of  5 variables:
#>  .. $ sample_id: chr  "id_100" "id_99" "id_98" "id_97" ...
#>  .. $ age      : int  29 47 65 57 52 40 42 63 49 42 ...
#>  .. $ sex      : chr  "male" "male" "female" "female" ...
#>  .. $ pos      : chr  "batch2" "batch1" "batch2" "batch1" ...
#>  .. $ neg      : chr  "batch2" "batch2" "batch2" "batch1" ...
#>  @ features       :'data.frame': 20 obs. of  5 variables:
#>  .. $ feature_id        : chr  "metab_id_1" "metab_id_2" "metab_id_3" "metab_id_4" ...
#>  .. $ platform          : chr  "neg" "neg" "neg" "pos" ...
#>  .. $ pathway           : logi  NA NA NA NA NA NA ...
#>  .. $ derived_feature   : logi  TRUE FALSE FALSE FALSE FALSE FALSE ...
#>  .. $ xenobiotic_feature: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
#>  @ exclusions     :List of 2
#>  .. $ samples :List of 5
#>  ..  ..$ user_excluded                    : chr(0) 
#>  ..  ..$ extreme_sample_missingness       : chr(0) 
#>  ..  ..$ user_defined_sample_missingness  : chr(0) 
#>  ..  ..$ user_defined_sample_totalpeakarea: chr(0) 
#>  ..  ..$ user_defined_sample_pca_outlier  : chr(0) 
#>  .. $ features:List of 3
#>  ..  ..$ user_excluded                   : chr(0) 
#>  ..  ..$ extreme_feature_missingness     : chr(0) 
#>  ..  ..$ user_defined_feature_missingness: chr(0) 
#>  @ feature_summary: num[0 , 0 , 0 ] 
#>  @ sample_summary : num[0 , 0 , 0 ]

Run sample summary

sample_summ <- sample_summary(metaboprep    = m, 
                              source_layer  = "input", 
                              outlier_udist = 1.0, 
                              output        = "data.frame")
sample_id missingness tpa_total tpa_complete_features outlier_count
id_100 0 38.699 38.699 0
id_99 0 39.451 39.451 0
id_98 0 45.135 45.135 1
id_97 0 37.289 37.289 2
id_96 0 31.545 31.545 3
id_95 0 36.751 36.751 2
id_94 0 37.384 37.384 0
id_93 0 34.069 34.069 0
id_92 0 38.942 38.942 0
id_91 0 42.743 42.743 2

Run sample summary on subset

Using the sample_ids and feature_ids arguments you can run the summary for a subset of the data. Note: all rows will be return, however summary data will only be returned for the specified ids.

sample_summ <- sample_summary(metaboprep    = m, 
                              source_layer  = "input", 
                              outlier_udist = 1.0,
                              sample_ids    = c("id_96", "id_97", "id_98", "id_99", "id_100"),
                              feature_ids   = c("metab_id_1", "metab_id_2", "metab_id_3"),
                              output        = "data.frame")
sample_id missingness tpa_total tpa_complete_features outlier_count
id_100 0 4.876 4.876 0
id_99 0 6.632 6.632 1
id_98 0 5.907 5.907 0
id_97 0 6.575 6.575 1
id_96 0 0.870 0.870 2
id_95 NA NA NA NA
id_94 NA NA NA NA
id_93 NA NA NA NA
id_92 NA NA NA NA
id_91 NA NA NA NA

Run PCA analysis

pc_and_outliers() performs principal component analysis. Missing data is imputed to the median and used to identify the number of informative or ‘significant’ PCs by (1) an acceleration analysis, and (2) a parallel analysis. Finally the number of sample outliers are determined at 3, 4, and 5 standard deviations from the mean on the top PCs as determined by the acceleration factor analysis.

pc_analysis <- pc_and_outliers(metaboprep      = m, 
                               source_layer    = "input")
sample_id pc1 pc2 pc3 pc4 pc5 pc6 pc7 pc8 pc9 pc10 pc1_3_sd_outlier pc2_3_sd_outlier pc3_3_sd_outlier pc1_4_sd_outlier pc2_4_sd_outlier pc3_4_sd_outlier pc1_5_sd_outlier pc2_5_sd_outlier pc3_5_sd_outlier
id_100 1.087 0.321 0.003 0.477 0.837 0.256 -0.162 -0.276 0.577 -1.466 0 0 0 0 0 0 0 0 0
id_99 0.418 0.573 1.581 0.110 -1.154 -0.256 -0.103 0.115 1.961 -1.201 0 0 0 0 0 0 0 0 0
id_98 -2.249 0.131 0.040 -0.605 -0.720 0.763 -0.614 0.076 -0.521 -1.457 0 0 0 0 0 0 0 0 0
id_97 -0.619 0.749 0.544 1.956 -1.659 -0.174 -0.669 -2.172 -0.463 1.769 0 0 0 0 0 0 0 0 0
id_96 2.231 -0.181 -0.938 0.038 -0.776 2.453 1.494 1.072 0.236 0.349 0 0 0 0 0 0 0 0 0
id_95 -2.266 -0.147 -0.042 -0.775 -0.625 -0.426 -0.755 1.979 0.624 -0.591 0 0 0 0 0 0 0 0 0
id_94 2.495 0.351 0.568 1.273 1.215 0.748 0.420 0.591 -0.047 1.117 0 0 0 0 0 0 0 0 0
id_93 0.941 -0.274 1.458 -1.740 0.901 -0.859 -0.361 0.689 -2.045 1.248 0 0 0 0 0 0 0 0 0
id_92 -1.384 -0.458 1.154 1.678 -0.977 -1.009 -0.017 0.656 1.952 0.756 0 0 0 0 0 0 0 0 0
id_91 -1.717 2.883 0.230 0.741 -0.961 1.664 1.846 -0.801 -0.530 0.544 0 0 0 0 0 0 0 0 0

Additional attributes

In addition, the variance explained vector is appended to the returned data.frame as and attribute. This can be accessed with the attribute name: [source_layer]_varexp, in this case we used the input data, therefore the attribute name is input_varexp. In a similar way, the results of the acceleration analysis (input_num_pcs_scree) and a parallel analysis (input_num_pcs_parallel) can also be extracted.

library(ggplot2)

# extract varexp from attributes
varexp <- attr(pc_analysis, 'input_varexp')

# subset to top 100 for nicer plotting
if (length(varexp) > 100) varexp <- varexp[1:100]

# get acceleration and parallel analysis results
af <- attr(pc_analysis, 'input_num_pcs_scree')
np <- attr(pc_analysis, 'input_num_pcs_parallel')
if (af==np) np <- np+0.1 # make line visible if equal

# as data.frame
x_labs <- sub("(?i)pc","", names(varexp))
ve     <- data.frame("pc"      = factor(x_labs, levels=x_labs),
                     "var_exp" = varexp)
lines  <- data.frame("Analysis" = c("Acceleration", "Parallel"), 
                     "pc"       = c(af, np))   

# plot
ggplot(ve, aes(x = pc, y = var_exp)) +
  geom_line(color = "grey") +
  geom_point(shape = 21, fill = "#377EB8", size = 2) +
  geom_vline(data = lines, aes(xintercept = pc, color = Analysis), inherit.aes = FALSE) +
  scale_color_manual(values = c("Acceleration"="#E41A1C", "Parallel"="#4DAF4A")) +
  scale_x_discrete(labels = function(x) ifelse(seq_along(x) %% 10 == 0 | x==1, x, "")) +
  labs(x = "PC", y = "Variance explained") +
  theme_classic() +
  theme(legend.position = "top")

Run sample & feature summaries together

summ <- summarise(metaboprep      = m, 
                  source_layer    = "input", 
                  outlier_udist   = 1.0,
                  tree_cut_height = 0.5,
                  output          = "data.frame")

str(summ)
#> List of 2
#>  $ sample_summary :'data.frame': 100 obs. of  24 variables:
#>   ..$ sample_id            : chr [1:100] "id_100" "id_99" "id_98" "id_97" ...
#>   ..$ missingness          : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ tpa_total            : num [1:100] 38.7 39.5 45.1 37.3 31.5 ...
#>   ..$ tpa_complete_features: num [1:100] 38.7 39.5 45.1 37.3 31.5 ...
#>   ..$ outlier_count        : num [1:100] 0 0 1 2 3 2 0 0 0 2 ...
#>   ..$ pc1                  : num [1:100] 1.087 0.418 -2.249 -0.619 2.231 ...
#>   ..$ pc2                  : num [1:100] 0.321 0.573 0.131 0.749 -0.181 ...
#>   ..$ pc3                  : num [1:100] 0.00338 1.58112 0.04016 0.5436 -0.9382 ...
#>   ..$ pc4                  : num [1:100] 0.4766 0.1105 -0.6054 1.9563 0.0382 ...
#>   ..$ pc5                  : num [1:100] 0.837 -1.154 -0.72 -1.659 -0.776 ...
#>   ..$ pc6                  : num [1:100] 0.256 -0.256 0.763 -0.174 2.453 ...
#>   ..$ pc7                  : num [1:100] -0.162 -0.103 -0.614 -0.669 1.494 ...
#>   ..$ pc8                  : num [1:100] -0.2756 0.1153 0.0756 -2.1717 1.0719 ...
#>   ..$ pc9                  : num [1:100] 0.577 1.961 -0.521 -0.463 0.236 ...
#>   ..$ pc10                 : num [1:100] -1.466 -1.201 -1.457 1.769 0.349 ...
#>   ..$ pc1_3_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ pc2_3_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ pc3_3_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ pc1_4_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ pc2_4_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ pc3_4_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ pc1_5_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ pc2_5_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ pc3_5_sd_outlier     : num [1:100] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..- attr(*, "input_varexp")= Named num [1:20] 0.0996 0.0884 0.0795 0.0691 0.0669 ...
#>   .. ..- attr(*, "names")= chr [1:20] "PC1" "PC2" "PC3" "PC4" ...
#>   ..- attr(*, "input_num_pcs_scree")= num 3
#>   ..- attr(*, "input_num_pcs_parallel")= int 14
#>   ..- attr(*, "input_outlier_udist")= num 1
#>  $ feature_summary:'data.frame': 20 obs. of  21 variables:
#>   ..$ feature_id          : chr [1:20] "metab_id_1" "metab_id_2" "metab_id_3" "metab_id_4" ...
#>   ..$ missingness         : num [1:20] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ outlier_count       : num [1:20] 5 0 10 5 11 7 7 0 10 0 ...
#>   ..$ n                   : num [1:20] 100 100 100 100 100 100 100 100 100 100 ...
#>   ..$ mean                : num [1:20] 0.511 0.521 0.488 0.464 0.521 ...
#>   ..$ sd                  : num [1:20] 0.293 0.31 0.283 0.286 0.293 ...
#>   ..$ median              : num [1:20] 0.53 0.547 0.504 0.466 0.547 ...
#>   ..$ min                 : num [1:20] 0.000465 0.018364 0.001192 0.004107 0.003896 ...
#>   ..$ max                 : num [1:20] 0.993 0.993 0.995 0.992 0.976 ...
#>   ..$ range               : num [1:20] 0.992 0.975 0.994 0.988 0.972 ...
#>   ..$ skew                : num [1:20] -0.123 -0.1496 -0.0365 0.0924 -0.2185 ...
#>   ..$ kurtosis            : num [1:20] -1.23 -1.4 -1.11 -1.2 -1.16 ...
#>   ..$ se                  : num [1:20] 0.0293 0.031 0.0283 0.0286 0.0293 ...
#>   ..$ missing             : num [1:20] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ var                 : num [1:20] 0.0859 0.0959 0.0804 0.0819 0.0856 ...
#>   ..$ disp_index          : num [1:20] 0.168 0.184 0.165 0.177 0.164 ...
#>   ..$ coef_variance       : num [1:20] 0.574 0.594 0.58 0.617 0.561 ...
#>   ..$ W                   : num [1:20] 0.949 0.924 0.963 0.954 0.945 ...
#>   ..$ log10_W             : num [1:20] 0.744 0.834 0.749 0.833 0.782 ...
#>   ..$ k                   : int [1:20] 1 2 3 4 5 6 7 8 9 10 ...
#>   ..$ independent_features: logi [1:20] TRUE TRUE TRUE TRUE TRUE TRUE ...
#>   ..- attr(*, "input_tree")=List of 7
#>   .. ..$ merge      : int [1:19, 1:2] -11 -2 -3 -13 -10 -8 -4 -12 -9 -1 ...
#>   .. ..$ height     : num [1:19] 0.672 0.723 0.764 0.771 0.774 ...
#>   .. ..$ order      : int [1:20] 3 6 4 5 8 15 10 19 7 1 ...
#>   .. ..$ labels     : chr [1:20] "metab_id_1" "metab_id_2" "metab_id_3" "metab_id_4" ...
#>   .. ..$ method     : chr "complete"
#>   .. ..$ call       : language stats::hclust(d = dist_matrix, method = "complete")
#>   .. ..$ dist.method: NULL
#>   .. ..- attr(*, "class")= chr "hclust"
#>   ..- attr(*, "input_outlier_udist")= num 1
#>   ..- attr(*, "input_tree_cut_height")= num 0.5