This RMarkdown document demonstrates how key elements from the notebook for case study 1 in the EpiGraphDB paper can be achieved using the R package. For detailed explanations of the case study please refer to the paper or the case study notebook.

Context

Mendelian randomization (MR), a technique to evaluate the causal role of modifiable exposures on a health outcome using a set of genetic instruments, tacitly assumes that a genetic variant (e.g. SNP) is only related to an outcome of interest through an exposure (i.e. the “exclusion restriction criterion”). Horizontal pleiotropy, where a SNP is associated with multiple phenotypes independently of the exposure of interest, potentially violates this assumption delivering misleading conclusions. In contrast, vertical pleiotropy, where a SNP is associated with multiple phenotypes on the same biological pathway, does not violate this assumption.

Here, we use external evidence on biological pathways and protein-protein interactions to assess the pleiotropic profile for a genetic variant based on the genes (and proteins) with which that variant is associated. In a graph representation, we will show that:

  • We have evidence of potential vertical pleiotropy when the proteins associated with a variant belong to one connected group,
  • In contrast, if the proteins associated with a single variant are isolated from each other, this suggests evidence of horizontal pleiotropy.

This case study goes as follows:

  • For a list of genes that we wish to assess their pleiotropic profiles (against a variant of interests), we retrieve from EpiGraphDB their mapped protein products.
  • Then respectively from the evidence on shared pathway network and protein-protein interaction network, we can extract the proteins groups formed in these networks as connected communities to assess the pleiotropic profiles of the genes.

Here we configure the parameters used in the case study example. In this example we will look at the variant rs12720356, a SNP located in chromosome 19 that has been associated with Crohn’s disease and psoriasis.**

SNP <- "rs12720356"

GENELIST <- c("ZGLP1", "FDX1L", "MRPL4", "ICAM5", "TYK2", "GRAP2", "KRI1", "TMED1", "ICAM1")

PPI_N_INTERMEDIATE_PROTEINS <- 1

Mapping genes to proteins

The first step of the analysis is to map each of these genes to their protein product.

get_gene_protein <- function(genelist) {
  endpoint <- "/mappings/gene-to-protein"
  params <- list(
    gene_name_list = genelist %>% I()
  )
  r <- query_epigraphdb(
    route = endpoint,
    params = params,
    mode = "table",
    method = "POST"
  )
  protein_df <- r
  if (nrow(protein_df) > 0) {
    res_df <- protein_df %>%
      select(gene_name = `gene.name`, uniprot_id = `protein.uniprot_id`)
  } else {
    res_df <- tibble() %>% set_names(c("gene_name", "uniprot_id"))
  }

  res_df
}

gene_protein_df <- get_gene_protein(genelist = GENELIST)
gene_protein_df
#> # A tibble: 9 x 2
#>   gene_name uniprot_id
#>   <chr>     <chr>     
#> 1 TMED1     Q13445    
#> 2 MRPL4     Q9BYD3    
#> 3 TYK2      P29597    
#> 4 KRI1      Q8N9T8    
#> 5 FDX1L     Q6P4F2    
#> 6 GRAP2     O75791    
#> 7 ICAM1     P05362    
#> 8 ICAM5     Q9UMF0    
#> 9 ZGLP1     P0C6A0

Identifying the involvement in the same biological pathways

For each protein we retrieve the pathways they are involved in.

get_protein_pathway <- function(gene_protein_df) {
  endpoint <- "/protein/in-pathway"
  params <- list(
    uniprot_id_list = gene_protein_df %>% pull(`uniprot_id`) %>% I()
  )
  df <- query_epigraphdb(route = endpoint, params = params, mode = "table", method = "POST")

  if (nrow(df) > 0) {
    res_df <- gene_protein_df %>%
      select(`uniprot_id`) %>%
      left_join(df, by = c("uniprot_id"))
  } else {
    res_df <- gene_protein_df %>%
      select(`uniprot_id`) %>%
      mutate(pathway_count = NA_integer_, pathway_reactome_id = NA_character_)
  }
  res_df <- res_df %>%
    mutate(
      pathway_count = ifelse(is.na(pathway_count), 0L, as.integer(pathway_count)),
      pathway_reactome_id = ifelse(is.na(pathway_reactome_id), c(), pathway_reactome_id)
    )

  res_df
}
pathway_df <- get_protein_pathway(gene_protein_df = gene_protein_df)
pathway_df
#> # A tibble: 9 x 3
#>   uniprot_id pathway_count pathway_reactome_id
#>   <chr>              <int> <list>             
#> 1 Q13445                 0 <NULL>             
#> 2 Q9BYD3                 0 <NULL>             
#> 3 P29597                 3 <chr [3]>          
#> 4 Q8N9T8                 0 <NULL>             
#> 5 Q6P4F2                 1 <chr [1]>          
#> 6 O75791                 3 <chr [3]>          
#> 7 P05362                 5 <chr [5]>          
#> 8 Q9UMF0                 2 <chr [2]>          
#> 9 P0C6A0                 0 <NULL>

Now for each pair of proteins we match the pathways they have in common.

get_shared_pathway <- function(pathway_df) {
  # For the protein-pathway data
  # Get protein-protein permutations where they share pathways

  per_permutation <- function(pathway_df, permutation) {
    df <- pathway_df %>% filter(uniprot_id %in% permutation)
    primary_pathway <- pathway_df %>%
      filter(uniprot_id == permutation[1]) %>%
      pull(pathway_reactome_id) %>%
      unlist()
    assoc_pathway <- pathway_df %>%
      filter(uniprot_id == permutation[2]) %>%
      pull(pathway_reactome_id) %>%
      unlist()
    intersect(primary_pathway, assoc_pathway)
  }

  pairwise_permutations <- pathway_df %>%
    pull(`uniprot_id`) %>%
    gtools::permutations(n = length(.), r = 2, v = .)
  shared_pathway_df <- tibble(
    protein = pairwise_permutations[, 1],
    assoc_protein = pairwise_permutations[, 2]
  ) %>%
    mutate(
      shared_pathway = map2(`protein`, `assoc_protein`, function(x, y) {
        per_permutation(pathway_df = pathway_df, permutation = c(x, y))
      }),
      combination = map2_chr(`protein`, `assoc_protein`, function(x, y) {
        comb <- sort(c(x, y))
        paste(comb, collapse = ",")
      }),
      count = map_int(`shared_pathway`, function(x) na.omit(x) %>% length()),
      connected = count > 0
    )

  shared_pathway_df
}
shared_pathway_df <- get_shared_pathway(pathway_df)
n_pairs <- length(shared_pathway_df %>% filter(count > 0))
print(glue::glue("Num. shared_pathway pairs: {n_pairs}"))
#> Num. shared_pathway pairs: 6
shared_pathway_df %>% arrange(desc(count))
#> # A tibble: 72 x 6
#>    protein assoc_protein shared_pathway combination   count connected
#>    <chr>   <chr>         <list>         <chr>         <int> <lgl>    
#>  1 P05362  Q9UMF0        <chr [2]>      P05362,Q9UMF0     2 TRUE     
#>  2 Q9UMF0  P05362        <chr [2]>      P05362,Q9UMF0     2 TRUE     
#>  3 P05362  P29597        <chr [1]>      P05362,P29597     1 TRUE     
#>  4 P29597  P05362        <chr [1]>      P05362,P29597     1 TRUE     
#>  5 O75791  P05362        <chr [0]>      O75791,P05362     0 FALSE    
#>  6 O75791  P0C6A0        <NULL>         O75791,P0C6A0     0 FALSE    
#>  7 O75791  P29597        <chr [0]>      O75791,P29597     0 FALSE    
#>  8 O75791  Q13445        <NULL>         O75791,Q13445     0 FALSE    
#>  9 O75791  Q6P4F2        <chr [0]>      O75791,Q6P4F2     0 FALSE    
#> 10 O75791  Q8N9T8        <NULL>         O75791,Q8N9T8     0 FALSE    
#> # … with 62 more rows

We can further query EpiGraphDB regarding the detailed pathway information using GET /meta/nodes/Pathway/search.

get_pathway_info <- function(reactome_id) {
  endpoint <- "/meta/nodes/Pathway/search"
  params <- list(id = reactome_id)
  df <- query_epigraphdb(route = endpoint, params = params, mode = "table")
  df
}

pathway <- shared_pathway_df %>%
  pull(shared_pathway) %>%
  unlist() %>%
  unique()

pathway_info <- pathway %>% map_df(get_pathway_info)
pathway_info %>% print()
#> # A tibble: 3 x 3
#>   node.in_disease node.name                                     node.reactome_id
#>   <chr>           <chr>                                         <chr>           
#> 1 FALSE           Interleukin-4 and Interleukin-13 signaling    R-HSA-6785807   
#> 2 FALSE           Integrin cell surface interactions            R-HSA-216083    
#> 3 FALSE           Immunoregulatory interactions between a Lymp… R-HSA-198933

In order to extract protein groups from the shared pathways, the last step for this query is to convert the shared pathway data into a graph where

  • Proteins are represented as nodes.
  • And shared pathways are represented as edges.
  • Then the protein groups (via shared pathways) are the connected communities in the graph.

We then count the number of nodes in each connected community and plot the graph.

protein_df_to_graph <- function(df) {
  df_connected <- df %>%
    filter(connected) %>%
    distinct(`combination`, .keep_all = TRUE)
  nodes <- df %>%
    pull(protein) %>%
    unique()
  graph <- igraph::graph_from_data_frame(
    df_connected,
    directed = FALSE, vertices = nodes
  )
  graph$layout <- igraph::layout_with_kk
  graph
}

graph_to_protein_groups <- function(graph) {
  graph %>%
    igraph::components() %>%
    igraph::groups() %>%
    tibble(group_member = .) %>%
    mutate(group_size = map_int(`group_member`, length)) %>%
    arrange(desc(group_size))
}

pathway_protein_graph <- shared_pathway_df %>% protein_df_to_graph()
pathway_protein_groups <- pathway_protein_graph %>% graph_to_protein_groups()
pathway_protein_groups %>% str()
#> tibble [7 × 2] (S3: tbl_df/tbl/data.frame)
#>  $ group_member:List of 7
#>   ..$ 2: chr [1:3] "P05362" "P29597" "Q9UMF0"
#>   ..$ 1: chr "O75791"
#>   ..$ 3: chr "P0C6A0"
#>   ..$ 4: chr "Q13445"
#>   ..$ 5: chr "Q6P4F2"
#>   ..$ 6: chr "Q8N9T8"
#>   ..$ 7: chr "Q9BYD3"
#>   ..- attr(*, "dim")= int 7
#>   ..- attr(*, "dimnames")=List of 1
#>   .. ..$ : chr [1:7] "2" "1" "3" "4" ...
#>  $ group_size  : Named int [1:7] 3 1 1 1 1 1 1
#>   ..- attr(*, "names")= chr [1:7] "2" "1" "3" "4" ...
plot(pathway_protein_graph)

Retrieving shared protein-protein interactions

We can complement the information given by pathway ontologies with data of interactions between proteins. For each pair of proteins, we retrieve shared protein-protein interactions, either a direct interaction or an interaction where there’s one mediator protein in the middle (see parameters section).

get_ppi <- function(gene_protein_df, n_intermediate_proteins = 0) {
  endpoint <- "/protein/ppi/pairwise"
  params <- list(
    uniprot_id_list = gene_protein_df %>% pull("uniprot_id") %>% I(),
    n_intermediate_proteins = n_intermediate_proteins
  )
  df <- query_epigraphdb(
    route = endpoint,
    params = params,
    mode = "table",
    method = "POST"
  )

  if (nrow(df) > 0) {
    res_df <- gene_protein_df %>%
      select(protein = uniprot_id) %>%
      left_join(df, by = c("protein"))
  } else {
    res_df <- gene_protein_df %>%
      select(protein = uniprot_id) %>%
      mutate(assoc_protein = NA_character_, path_size = NA_integer_)
  }
  res_df
}
ppi_df <- get_ppi(gene_protein_df, n_intermediate_proteins = PPI_N_INTERMEDIATE_PROTEINS)
ppi_df
#> # A tibble: 77 x 3
#>    protein assoc_protein path_size
#>    <chr>   <chr>             <int>
#>  1 Q13445  <NA>                 NA
#>  2 Q9BYD3  Q8N9T8                2
#>  3 Q9BYD3  Q8N9T8                2
#>  4 Q9BYD3  Q8N9T8                2
#>  5 Q9BYD3  Q8N9T8                2
#>  6 P29597  P05362                2
#>  7 P29597  P05362                2
#>  8 P29597  P05362                2
#>  9 P29597  P05362                2
#> 10 P29597  O75791                2
#> # … with 67 more rows

Like in the query for pathway data, we convert and analyse these results as a graph.

get_shared_ppi <- function(ppi_df) {
  per_row <- function(ppi_df, x, y) {
    res <- ppi_df %>%
      filter(
        `protein` == x,
        `assoc_protein` == y
      ) %>%
      nrow() %>%
      `>`(0)
    res
  }

  pairwise_permutations <- ppi_df %>%
    pull(`protein`) %>%
    unique() %>%
    gtools::permutations(n = length(.), r = 2, v = .)
  shared_ppi_df <- tibble(
    protein = pairwise_permutations[, 1],
    assoc_protein = pairwise_permutations[, 2]
  ) %>%
    mutate(
      shared_ppi = map2_lgl(`protein`, `assoc_protein`, function(x, y) {
        per_row(ppi_df = ppi_df, x = x, y = y)
      }),
      combination = map2_chr(`protein`, `assoc_protein`, function(x, y) {
        comb <- sort(c(x, y))
        paste(comb, collapse = ",")
      }),
      connected = shared_ppi
    )

  shared_ppi_df
}

shared_ppi_df <- get_shared_ppi(ppi_df)
n_ppi_pairs <- shared_ppi_df %>%
  filter(shared_ppi) %>%
  nrow()
print(glue::glue("Num. shared_ppi pairs: {n_ppi_pairs}"))
#> Num. shared_ppi pairs: 10
shared_ppi_df
#> # A tibble: 72 x 5
#>    protein assoc_protein shared_ppi combination   connected
#>    <chr>   <chr>         <lgl>      <chr>         <lgl>    
#>  1 O75791  P05362        TRUE       O75791,P05362 TRUE     
#>  2 O75791  P0C6A0        FALSE      O75791,P0C6A0 FALSE    
#>  3 O75791  P29597        TRUE       O75791,P29597 TRUE     
#>  4 O75791  Q13445        FALSE      O75791,Q13445 FALSE    
#>  5 O75791  Q6P4F2        FALSE      O75791,Q6P4F2 FALSE    
#>  6 O75791  Q8N9T8        FALSE      O75791,Q8N9T8 FALSE    
#>  7 O75791  Q9BYD3        FALSE      O75791,Q9BYD3 FALSE    
#>  8 O75791  Q9UMF0        FALSE      O75791,Q9UMF0 FALSE    
#>  9 P05362  O75791        TRUE       O75791,P05362 TRUE     
#> 10 P05362  P0C6A0        FALSE      P05362,P0C6A0 FALSE    
#> # … with 62 more rows
ppi_protein_graph <- protein_df_to_graph(shared_ppi_df)
ppi_protein_groups <- graph_to_protein_groups(ppi_protein_graph)
ppi_protein_groups %>% str()
#> tibble [5 × 2] (S3: tbl_df/tbl/data.frame)
#>  $ group_member:List of 5
#>   ..$ 1: chr [1:4] "O75791" "P05362" "P29597" "Q9UMF0"
#>   ..$ 5: chr [1:2] "Q8N9T8" "Q9BYD3"
#>   ..$ 2: chr "P0C6A0"
#>   ..$ 3: chr "Q13445"
#>   ..$ 4: chr "Q6P4F2"
#>   ..- attr(*, "dim")= int 5
#>   ..- attr(*, "dimnames")=List of 1
#>   .. ..$ : chr [1:5] "1" "5" "2" "3" ...
#>  $ group_size  : Named int [1:5] 4 2 1 1 1
#>   ..- attr(*, "names")= chr [1:5] "1" "5" "2" "3" ...
plot(ppi_protein_graph)

sessionInfo

sessionInfo()
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 16.04.6 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/openblas-base/libblas.so.3
#> LAPACK: /usr/lib/libopenblasp-r0.2.18.so
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] epigraphdb_0.2.1 igraph_1.2.5     glue_1.4.2       purrr_0.3.4     
#> [5] dplyr_1.0.2      magrittr_1.5    
#> 
#> loaded via a namespace (and not attached):
#>  [1] knitr_1.30         tidyselect_1.1.0   R6_2.4.1           ragg_0.3.1        
#>  [5] rlang_0.4.7        fansi_0.4.1        httr_1.4.2         stringr_1.4.0     
#>  [9] tools_4.0.2        xfun_0.18          utf8_1.1.4         cli_2.0.2         
#> [13] gtools_3.8.2       htmltools_0.5.0    systemfonts_0.3.2  ellipsis_0.3.1    
#> [17] yaml_2.2.1         assertthat_0.2.1   rprojroot_1.3-2    digest_0.6.25     
#> [21] tibble_3.0.3       lifecycle_0.2.0    pkgdown_1.6.1.9000 crayon_1.3.4      
#> [25] vctrs_0.3.4        fs_1.5.0           curl_4.3           memoise_1.1.0     
#> [29] evaluate_0.14      rmarkdown_2.4      stringi_1.5.3      pillar_1.4.6      
#> [33] compiler_4.0.2     desc_1.2.0         generics_0.0.2     backports_1.1.10  
#> [37] jsonlite_1.7.1     pkgconfig_2.0.3