4  Post DTU Analysis

In this chapter, we show how to explore the results from the DTU analysis performed using satuRn (see Statistical testing chapter).

Load needed objects computed in Statistical testing chapter

library(here)
library(SummarizedExperiment)
# Load SummarizeExperiment with DTU results
se <- readRDS(here::here("data", "glinos_saturn_dtu.rds"))
# Load exons 
exons <- readRDS(here::here("data", "glinos_exons.rds")) 

Extract test results

sig_res <- rowData(se)[["fitDTUResult_WT_vs_KD"]] |>
  tibble::as_tibble() |>
  dplyr::bind_cols(as.data.frame(rowData(se)[,1:4])) |>
  dplyr::filter(empirical_FDR < .1) |>
  dplyr::select(gene_id, isoform_id, symbol, estimates, empirical_pval, empirical_FDR) |>
  dplyr::arrange(empirical_pval)

sig_res
# A tibble: 77 × 6
   gene_id            isoform_id   symbol estimates empirical_pval empirical_FDR
   <chr>              <chr>        <chr>      <dbl>          <dbl>         <dbl>
 1 ENSG00000196923.13 ENST0000035… PDLIM7      3.60     0.00000937       0.00959
 2 ENSG00000196923.13 ENST0000035… PDLIM7     -3.60     0.0000111        0.00959
 3 ENSG00000134285.10 ENST0000055… FKBP11      1.98     0.0000126        0.00959
 4 ENSG00000134285.10 ENST0000055… FKBP11     -1.98     0.0000149        0.00959
 5 ENSG00000198467.13 ENST0000037… TPM2        2.63     0.0000433        0.0195 
 6 ENSG00000138326.18 ENST0000037… RPS24       1.60     0.0000456        0.0195 
 7 ENSG00000138326.18 ENST0000036… RPS24      -1.28     0.0000640        0.0196 
 8 ENSG00000112305.14 ENST0000031… SMAP1       4.18     0.0000695        0.0196 
 9 ENSG00000115310.17 ENST0000031… RTN4        2.70     0.0000809        0.0196 
10 ENSG00000112305.14 ENST0000037… SMAP1      -4.18     0.0000811        0.0196 
# ℹ 67 more rows

Obtain a GRanges of all the exons present in significant DTU. First, we flatten-out the GRangesList of all the transcripts and include a column to identify if an exon is internal (i.e. is not the first or last in the isoform) or not.

library(plyranges)

flat_exons <- unlist(exons) #GRanges

#include 'isoform_id' column 
flat_exons$isoform_id <- names(flat_exons)

#include 'internal' column
tab <- table(flat_exons$isoform_id)
flat_exons <- flat_exons |> 
                      mutate(
                        nexons = tab[isoform_id], 
                        internal = exon_rank > 1 & exon_rank < nexons
                      )
flat_exons 
GRanges object with 868829 ranges and 7 metadata columns:
                                        seqnames        ranges strand |
                                           <Rle>     <IRanges>  <Rle> |
             ENSG00000237491.8-c0e70edb     chr1 785782-787672      + |
                      ENST00000326734.2     chr1 817713-818202      + |
                      ENST00000326734.2     chr1 818723-821358      + |
             ENSG00000228794.8-5a74a94f     chr1 827608-827775      + |
             ENSG00000228794.8-5a74a94f     chr1 829003-829104      + |
                                    ...      ...           ...    ... .
  chrM:12000_ENSG00000198786.2-ae9cf68f     chrM   12266-15326      + |
  chrM:12000_ENSG00000198786.2-3aadce2b     chrM   12266-15707      + |
                     chrM:1000-d5c15fee     chrM     1602-5511      - |
                     chrM:3000-ed670af4     chrM     3305-5121      - |
                     chrM:3000-53ee06de     chrM     3305-5511      - |
                                          exon_id   exon_name exon_rank
                                        <integer> <character> <integer>
             ENSG00000237491.8-c0e70edb         1        <NA>         1
                      ENST00000326734.2         2        <NA>         1
                      ENST00000326734.2         3        <NA>         2
             ENSG00000228794.8-5a74a94f         4        <NA>         1
             ENSG00000228794.8-5a74a94f         9        <NA>         2
                                    ...       ...         ...       ...
  chrM:12000_ENSG00000198786.2-ae9cf68f    227899        <NA>         1
  chrM:12000_ENSG00000198786.2-3aadce2b    227900        <NA>         1
                     chrM:1000-d5c15fee    227901        <NA>         1
                     chrM:3000-ed670af4    227902        <NA>         1
                     chrM:3000-53ee06de    227903        <NA>         1
                                                    isoform_id
                                                   <character>
             ENSG00000237491.8-c0e70edb ENSG00000237491.8-c0..
                      ENST00000326734.2      ENST00000326734.2
                      ENST00000326734.2      ENST00000326734.2
             ENSG00000228794.8-5a74a94f ENSG00000228794.8-5a..
             ENSG00000228794.8-5a74a94f ENSG00000228794.8-5a..
                                    ...                    ...
  chrM:12000_ENSG00000198786.2-ae9cf68f chrM:12000_ENSG00000..
  chrM:12000_ENSG00000198786.2-3aadce2b chrM:12000_ENSG00000..
                     chrM:1000-d5c15fee     chrM:1000-d5c15fee
                     chrM:3000-ed670af4     chrM:3000-ed670af4
                     chrM:3000-53ee06de     chrM:3000-53ee06de
                                                                  nexons.Var1
                                                                     <factor>
             ENSG00000237491.8-c0e70edb            ENSG00000237491.8-c0e70edb
                      ENST00000326734.2            ENST00000326734.2         
                      ENST00000326734.2            ENST00000326734.2         
             ENSG00000228794.8-5a74a94f            ENSG00000228794.8-5a74a94f
             ENSG00000228794.8-5a74a94f            ENSG00000228794.8-5a74a94f
                                    ...                                   ...
  chrM:12000_ENSG00000198786.2-ae9cf68f chrM:12000_ENSG00000198786.2-ae9cf68f
  chrM:12000_ENSG00000198786.2-3aadce2b chrM:12000_ENSG00000198786.2-3aadce2b
                     chrM:1000-d5c15fee chrM:1000-d5c15fee                   
                     chrM:3000-ed670af4 chrM:3000-ed670af4                   
                     chrM:3000-53ee06de chrM:3000-53ee06de                   
                                        nexons.Freq internal
                                          <integer>  <array>
             ENSG00000237491.8-c0e70edb           1    FALSE
                      ENST00000326734.2           2    FALSE
                      ENST00000326734.2           2    FALSE
             ENSG00000228794.8-5a74a94f           4    FALSE
             ENSG00000228794.8-5a74a94f           4     TRUE
                                    ...         ...      ...
  chrM:12000_ENSG00000198786.2-ae9cf68f           1    FALSE
  chrM:12000_ENSG00000198786.2-3aadce2b           1    FALSE
                     chrM:1000-d5c15fee           1    FALSE
                     chrM:3000-ed670af4           1    FALSE
                     chrM:3000-53ee06de           1    FALSE
  -------
  seqinfo: 25 sequences (1 circular) from an unspecified genome; no seqlengths

To perform exon-level detection events, we include a metadata column that will tell us if a given exon has a - or + coefficient (from satuRn analysis).

#filter to keep only significant DTU transcripts 
flat_sig_exons <- flat_exons |>
                       filter( isoform_id %in% sig_res$isoform_id)

#include coef +/- column from the DTU analysis saturn 
flat_sig_exons$coef <- sig_res$estimates[match(names(flat_sig_exons), sig_res$isoform_id)]
flat_sig_exons$sign <- sign(flat_sig_exons$coef)

#include gene symbol for each transcript name 
flat_sig_exons$symbol <- sig_res$symbol[match(names(flat_sig_exons), sig_res$isoform_id)]

flat_sig_exons
GRanges object with 921 ranges and 10 metadata columns:
                       seqnames              ranges strand |   exon_id
                          <Rle>           <IRanges>  <Rle> | <integer>
  ENST00000315554.12-1     chr1   22052707-22052742      + |      1503
  ENST00000315554.12-1     chr1   22078429-22078583      + |      1505
  ENST00000315554.12-1     chr1   22081722-22081794      + |      1507
  ENST00000315554.12-1     chr1   22086439-22086548      + |      1508
  ENST00000315554.12-1     chr1   22086669-22086866      + |      1509
                   ...      ...                 ...    ... .       ...
     ENST00000216879.8    chr20     1445666-1445830      - |    211007
     ENST00000216879.8    chr20     1443511-1443911      - |    211006
     ENST00000451301.5     chrX 103686631-103686705      - |    226429
     ENST00000451301.5     chrX 103685171-103685260      - |    226428
     ENST00000451301.5     chrX 103675496-103677051      - |    226423
                         exon_name exon_rank           isoform_id
                       <character> <integer>          <character>
  ENST00000315554.12-1        <NA>         1 ENST00000315554.12-1
  ENST00000315554.12-1        <NA>         2 ENST00000315554.12-1
  ENST00000315554.12-1        <NA>         3 ENST00000315554.12-1
  ENST00000315554.12-1        <NA>         4 ENST00000315554.12-1
  ENST00000315554.12-1        <NA>         5 ENST00000315554.12-1
                   ...         ...       ...                  ...
     ENST00000216879.8        <NA>         8    ENST00000216879.8
     ENST00000216879.8        <NA>         9    ENST00000216879.8
     ENST00000451301.5        <NA>         1    ENST00000451301.5
     ENST00000451301.5        <NA>         2    ENST00000451301.5
     ENST00000451301.5        <NA>         3    ENST00000451301.5
                                nexons.Var1 nexons.Freq internal      coef
                                   <factor>   <integer>  <array> <numeric>
  ENST00000315554.12-1 ENST00000315554.12-1           6    FALSE  -1.84845
  ENST00000315554.12-1 ENST00000315554.12-1           6     TRUE  -1.84845
  ENST00000315554.12-1 ENST00000315554.12-1           6     TRUE  -1.84845
  ENST00000315554.12-1 ENST00000315554.12-1           6     TRUE  -1.84845
  ENST00000315554.12-1 ENST00000315554.12-1           6     TRUE  -1.84845
                   ...                  ...         ...      ...       ...
     ENST00000216879.8    ENST00000216879.8           9     TRUE   1.30549
     ENST00000216879.8    ENST00000216879.8           9    FALSE   1.30549
     ENST00000451301.5    ENST00000451301.5           3    FALSE  -2.08015
     ENST00000451301.5    ENST00000451301.5           3     TRUE  -2.08015
     ENST00000451301.5    ENST00000451301.5           3    FALSE  -2.08015
                            sign      symbol
                       <numeric> <character>
  ENST00000315554.12-1        -1       CDC42
  ENST00000315554.12-1        -1       CDC42
  ENST00000315554.12-1        -1       CDC42
  ENST00000315554.12-1        -1       CDC42
  ENST00000315554.12-1        -1       CDC42
                   ...       ...         ...
     ENST00000216879.8         1      NSFL1C
     ENST00000216879.8         1      NSFL1C
     ENST00000451301.5        -1     MORF4L2
     ENST00000451301.5        -1     MORF4L2
     ENST00000451301.5        -1     MORF4L2
  -------
  seqinfo: 25 sequences (1 circular) from an unspecified genome; no seqlengths

4.1 Detect skipped-exon events

We are looking to detect exons that were skipped-out due to PTBP1 (down-regulated exons). A transcript with a positive coefficient will be more likely to be expressed when the protein is abundant, and a transcript with a negative coefficient will be less likely to be expressed when the protein is abundant.

Exons that appear in transcripts with negative coefficients and that do not appear in transcripts with positive coefficients may be down-regulated by the protein.

We want to get exons that are present in the KD condition (sign -1) that are not present in the WD condition (sign -1)

library(dplyr)
flat_sig_exons <- flat_sig_exons |>
    dplyr::mutate(key = paste0(isoform_id, "-", exon_rank))

# 1) split + vs – coef sign
pos_exons  <- flat_sig_exons |> filter(sign ==  1)
neg_exons <- flat_sig_exons |> filter(sign ==  -1)

# 2) filter non overlaps 
downreg_candidates <- neg_exons |>
  filter_by_non_overlaps_directed(pos_exons) |>
  mutate(SE = TRUE) |>
  filter(internal == TRUE)

# 3) detect left and right exons from candidates
left_keys <- paste0(downreg_candidates$isoform_id, "-", 
                    downreg_candidates$exon_rank-1)
left_exons <- flat_sig_exons |>
  filter(key %in% left_keys)

right_keys <- paste0(downreg_candidates$isoform_id, "-", 
                     downreg_candidates$exon_rank+1)
right_exons <- flat_sig_exons |>
  filter(key %in% right_keys)

# 4) filter in candidates with left and right present in pos_exons
downreg_candidates <-  downreg_candidates |>
  mutate(left_and_right =
                 left_exons %in% pos_exons &
                 right_exons %in% pos_exons
  )
downreg_exons  <- downreg_candidates |> filter(left_and_right == TRUE)

# summary
length(downreg_exons) # n of spliced exons 
[1] 19
length(unique(downreg_exons$symbol)) # n of dif genes
[1] 16

List of genes that had skipped-exon events

downreg_exons$symbol |> unique()
 [1] "TPM3"   "SKP1"   "PDLIM7" "SMAP1"  "TPM2"   "RPS24"  "DPF2"   "EIF4G2"
 [9] "ISCU"   "FERMT2" "PKM"    "RNPS1"  "EXOC7"  "TECR"   "AXL"    "NSFL1C"

4.2 Analyse upstream regions of the down-regulated exons

We know that PTPB1 binds to intronic regions upstream to the skipped exons, regions 15-20 rich in U and C.
With the selected down-regulated exon candidates, we can analyze their upstream region and study their bp composition to look for PTPB1 signals.

Get upstream sequences.

library(Biostrings)
library(BSgenome.Hsapiens.UCSC.hg38)

#get ranges of 100bp upstream region
width_upstream <- 100
upstr_downreg_exons <- downreg_exons |>  #GRanges
        flank_upstream(width = width_upstream) 

# get sequence from GRanges 
seq_downreg_exons <-  Hsapiens |>  ## RNAStringSet object
                       getSeq(upstr_downreg_exons) |>
                       RNAStringSet() 

seq_downreg_exons
RNAStringSet object of length 19:
     width seq                                              names               
 [1]   100 UCACACUUUCCUAAUUUCUAAUG...UGUGGUUCCUGUGCCUGUCCAG ENST00000328159.8
 [2]   100 CUCUGAAAUGAAGCUUUUAAGUA...AUAUGGUCUUGCAUUGCCUCAG ENST00000522855.5
 [3]   100 GCCUUGUCCCCGGCUGUUCUGAC...GUCUGUCUUAUUUCCUUCACAG ENST00000359895.6
 [4]   100 AUCUCAAAAAUAUACAGGGAUUU...ACUUUUCACCUGUACUCCACAG ENST00000370455.7
 [5]   100 CUUCUCUUCUCUCCUCCACCCGU...CCCCGCCACACACCCCCUGCAG ENST00000329305.6
 ...   ... ...
[15]   100 UUGACAUGCCGUUUCUAACCUGU...UGCACUGCCUUUCACUCCGUAG ENST00000607838.5
[16]   100 CUGGGUGCGGGGCCUGGGGGGCA...CUCUGUGUGGCUCUGGUGACAG ENST00000589210.5
[17]   100 CCUGGUCCAAGCACCUGAGUCUC...CUUACUGUUCCUCUGUCCCCAG ENST00000598987.5
[18]   100 CCUCACUCCCUUACCCGUGCCAC...UCUCUGUCCUUUCUUCUCACAG ENST00000301178.8
[19]   100 GCUCCUGGUGCUGUGUCACACCA...AACCUCACGGCUCUGACUUAAG ENST00000476071.5

Create ranges using sliding window of with 10 with 5 bp overlap

window_width <- 10
overlap <- 5
step <-  window_width - overlap 
n_windows <- ceiling((width_upstream - window_width) / step) + 1

# create GRanges with the defined window ranges
windows <- upstr_downreg_exons |> slide_ranges( width = window_width,  # GRanges
                                         step = step) # divide into windows
windows |> as_tibble()
# A tibble: 361 × 6
   seqnames     start       end width strand partition
   <fct>        <int>     <int> <int> <fct>      <int>
 1 chr1     154169384 154169393    10 -              1
 2 chr1     154169389 154169398    10 -              1
 3 chr1     154169394 154169403    10 -              1
 4 chr1     154169399 154169408    10 -              1
 5 chr1     154169404 154169413    10 -              1
 6 chr1     154169409 154169418    10 -              1
 7 chr1     154169414 154169423    10 -              1
 8 chr1     154169419 154169428    10 -              1
 9 chr1     154169424 154169433    10 -              1
10 chr1     154169429 154169438    10 -              1
# ℹ 351 more rows

Get the sequence for each window

seq_windows <-  Hsapiens |>
  getSeq(windows) |>
  RNAStringSet()  # RNAStringSet
# The RNAStringSet result has n_windows * (# of exons) rows.
head(seq_windows)
RNAStringSet object of length 6:
    width seq                                               names               
[1]    10 GCCUGUCCAG                                        ENST00000328159.8
[2]    10 CCUGUGCCUG                                        ENST00000328159.8
[3]    10 UGGUUCCUGU                                        ENST00000328159.8
[4]    10 CCCUGUGGUU                                        ENST00000328159.8
[5]    10 UUCCUCCCUG                                        ENST00000328159.8
[6]    10 CCUCAUUCCU                                        ENST00000328159.8

Calculate the oligonucleotide frequency (A/C/G/U) per window

bp_counts <- oligonucleotideFrequency(seq_windows, 
                                      width = 1) #1-mer
windows_bp <- bp_counts / rowSums(bp_counts)
windows_bp |> as_tibble()
# A tibble: 361 × 4
       A     C     G     U
   <dbl> <dbl> <dbl> <dbl>
 1   0.1   0.4   0.3   0.2
 2   0     0.4   0.3   0.3
 3   0     0.2   0.3   0.5
 4   0     0.3   0.3   0.4
 5   0     0.5   0.1   0.4
 6   0.1   0.5   0     0.4
 7   0.1   0.5   0.1   0.3
 8   0.1   0.5   0.1   0.3
 9   0.1   0.5   0     0.4
10   0.3   0.2   0.1   0.4
# ℹ 351 more rows

4.3 Run SPLain app

SPLain is a Shiny app that allows the exploration and visualization of DTU results. You can find the repository at https://github.com/beamimc/SPLain, and follow the installation instructions. Basically, we need to clone the repo and call the app providing the path of the cloned repo.

splain_path <- "your_path/SPLain" # path to dir with the cloned SPLain repo
app_dir <- file.path(splain_path, "app")

# load the SPLain app 
source(file.path(app_dir, "app.R"))

# Run the Shiny app with your data and app directory
shiny::runApp(
  app(
    se = se,
    exons = exons,
    app_dir = app_dir
  )
)