library(here)
library(SummarizedExperiment)
# Load SummarizeExperiment with DTU results
se <- readRDS(here::here("data", "glinos_saturn_dtu.rds"))
# Load exons
exons <- readRDS(here::here("data", "glinos_exons.rds")) 4 Post DTU Analysis
In this chapter, we show how to explore the results from the DTU analysis performed using satuRn (see Statistical testing chapter).
Load needed objects computed in Statistical testing chapter
Extract test results
sig_res <- rowData(se)[["fitDTUResult_WT_vs_KD"]] |>
tibble::as_tibble() |>
dplyr::bind_cols(as.data.frame(rowData(se)[,1:4])) |>
dplyr::filter(empirical_FDR < .1) |>
dplyr::select(gene_id, isoform_id, symbol, estimates, empirical_pval, empirical_FDR) |>
dplyr::arrange(empirical_pval)
sig_res# A tibble: 77 × 6
gene_id isoform_id symbol estimates empirical_pval empirical_FDR
<chr> <chr> <chr> <dbl> <dbl> <dbl>
1 ENSG00000196923.13 ENST0000035… PDLIM7 3.60 0.00000937 0.00959
2 ENSG00000196923.13 ENST0000035… PDLIM7 -3.60 0.0000111 0.00959
3 ENSG00000134285.10 ENST0000055… FKBP11 1.98 0.0000126 0.00959
4 ENSG00000134285.10 ENST0000055… FKBP11 -1.98 0.0000149 0.00959
5 ENSG00000198467.13 ENST0000037… TPM2 2.63 0.0000433 0.0195
6 ENSG00000138326.18 ENST0000037… RPS24 1.60 0.0000456 0.0195
7 ENSG00000138326.18 ENST0000036… RPS24 -1.28 0.0000640 0.0196
8 ENSG00000112305.14 ENST0000031… SMAP1 4.18 0.0000695 0.0196
9 ENSG00000115310.17 ENST0000031… RTN4 2.70 0.0000809 0.0196
10 ENSG00000112305.14 ENST0000037… SMAP1 -4.18 0.0000811 0.0196
# ℹ 67 more rows
Obtain a GRanges of all the exons present in significant DTU. First, we flatten-out the GRangesList of all the transcripts and include a column to identify if an exon is internal (i.e. is not the first or last in the isoform) or not.
library(plyranges)
flat_exons <- unlist(exons) #GRanges
#include 'isoform_id' column
flat_exons$isoform_id <- names(flat_exons)
#include 'internal' column
tab <- table(flat_exons$isoform_id)
flat_exons <- flat_exons |>
mutate(
nexons = tab[isoform_id],
internal = exon_rank > 1 & exon_rank < nexons
)
flat_exons GRanges object with 868829 ranges and 7 metadata columns:
seqnames ranges strand |
<Rle> <IRanges> <Rle> |
ENSG00000237491.8-c0e70edb chr1 785782-787672 + |
ENST00000326734.2 chr1 817713-818202 + |
ENST00000326734.2 chr1 818723-821358 + |
ENSG00000228794.8-5a74a94f chr1 827608-827775 + |
ENSG00000228794.8-5a74a94f chr1 829003-829104 + |
... ... ... ... .
chrM:12000_ENSG00000198786.2-ae9cf68f chrM 12266-15326 + |
chrM:12000_ENSG00000198786.2-3aadce2b chrM 12266-15707 + |
chrM:1000-d5c15fee chrM 1602-5511 - |
chrM:3000-ed670af4 chrM 3305-5121 - |
chrM:3000-53ee06de chrM 3305-5511 - |
exon_id exon_name exon_rank
<integer> <character> <integer>
ENSG00000237491.8-c0e70edb 1 <NA> 1
ENST00000326734.2 2 <NA> 1
ENST00000326734.2 3 <NA> 2
ENSG00000228794.8-5a74a94f 4 <NA> 1
ENSG00000228794.8-5a74a94f 9 <NA> 2
... ... ... ...
chrM:12000_ENSG00000198786.2-ae9cf68f 227899 <NA> 1
chrM:12000_ENSG00000198786.2-3aadce2b 227900 <NA> 1
chrM:1000-d5c15fee 227901 <NA> 1
chrM:3000-ed670af4 227902 <NA> 1
chrM:3000-53ee06de 227903 <NA> 1
isoform_id
<character>
ENSG00000237491.8-c0e70edb ENSG00000237491.8-c0..
ENST00000326734.2 ENST00000326734.2
ENST00000326734.2 ENST00000326734.2
ENSG00000228794.8-5a74a94f ENSG00000228794.8-5a..
ENSG00000228794.8-5a74a94f ENSG00000228794.8-5a..
... ...
chrM:12000_ENSG00000198786.2-ae9cf68f chrM:12000_ENSG00000..
chrM:12000_ENSG00000198786.2-3aadce2b chrM:12000_ENSG00000..
chrM:1000-d5c15fee chrM:1000-d5c15fee
chrM:3000-ed670af4 chrM:3000-ed670af4
chrM:3000-53ee06de chrM:3000-53ee06de
nexons.Var1
<factor>
ENSG00000237491.8-c0e70edb ENSG00000237491.8-c0e70edb
ENST00000326734.2 ENST00000326734.2
ENST00000326734.2 ENST00000326734.2
ENSG00000228794.8-5a74a94f ENSG00000228794.8-5a74a94f
ENSG00000228794.8-5a74a94f ENSG00000228794.8-5a74a94f
... ...
chrM:12000_ENSG00000198786.2-ae9cf68f chrM:12000_ENSG00000198786.2-ae9cf68f
chrM:12000_ENSG00000198786.2-3aadce2b chrM:12000_ENSG00000198786.2-3aadce2b
chrM:1000-d5c15fee chrM:1000-d5c15fee
chrM:3000-ed670af4 chrM:3000-ed670af4
chrM:3000-53ee06de chrM:3000-53ee06de
nexons.Freq internal
<integer> <array>
ENSG00000237491.8-c0e70edb 1 FALSE
ENST00000326734.2 2 FALSE
ENST00000326734.2 2 FALSE
ENSG00000228794.8-5a74a94f 4 FALSE
ENSG00000228794.8-5a74a94f 4 TRUE
... ... ...
chrM:12000_ENSG00000198786.2-ae9cf68f 1 FALSE
chrM:12000_ENSG00000198786.2-3aadce2b 1 FALSE
chrM:1000-d5c15fee 1 FALSE
chrM:3000-ed670af4 1 FALSE
chrM:3000-53ee06de 1 FALSE
-------
seqinfo: 25 sequences (1 circular) from an unspecified genome; no seqlengths
To perform exon-level detection events, we include a metadata column that will tell us if a given exon has a - or + coefficient (from satuRn analysis).
#filter to keep only significant DTU transcripts
flat_sig_exons <- flat_exons |>
filter( isoform_id %in% sig_res$isoform_id)
#include coef +/- column from the DTU analysis saturn
flat_sig_exons$coef <- sig_res$estimates[match(names(flat_sig_exons), sig_res$isoform_id)]
flat_sig_exons$sign <- sign(flat_sig_exons$coef)
#include gene symbol for each transcript name
flat_sig_exons$symbol <- sig_res$symbol[match(names(flat_sig_exons), sig_res$isoform_id)]
flat_sig_exonsGRanges object with 921 ranges and 10 metadata columns:
seqnames ranges strand | exon_id
<Rle> <IRanges> <Rle> | <integer>
ENST00000315554.12-1 chr1 22052707-22052742 + | 1503
ENST00000315554.12-1 chr1 22078429-22078583 + | 1505
ENST00000315554.12-1 chr1 22081722-22081794 + | 1507
ENST00000315554.12-1 chr1 22086439-22086548 + | 1508
ENST00000315554.12-1 chr1 22086669-22086866 + | 1509
... ... ... ... . ...
ENST00000216879.8 chr20 1445666-1445830 - | 211007
ENST00000216879.8 chr20 1443511-1443911 - | 211006
ENST00000451301.5 chrX 103686631-103686705 - | 226429
ENST00000451301.5 chrX 103685171-103685260 - | 226428
ENST00000451301.5 chrX 103675496-103677051 - | 226423
exon_name exon_rank isoform_id
<character> <integer> <character>
ENST00000315554.12-1 <NA> 1 ENST00000315554.12-1
ENST00000315554.12-1 <NA> 2 ENST00000315554.12-1
ENST00000315554.12-1 <NA> 3 ENST00000315554.12-1
ENST00000315554.12-1 <NA> 4 ENST00000315554.12-1
ENST00000315554.12-1 <NA> 5 ENST00000315554.12-1
... ... ... ...
ENST00000216879.8 <NA> 8 ENST00000216879.8
ENST00000216879.8 <NA> 9 ENST00000216879.8
ENST00000451301.5 <NA> 1 ENST00000451301.5
ENST00000451301.5 <NA> 2 ENST00000451301.5
ENST00000451301.5 <NA> 3 ENST00000451301.5
nexons.Var1 nexons.Freq internal coef
<factor> <integer> <array> <numeric>
ENST00000315554.12-1 ENST00000315554.12-1 6 FALSE -1.84845
ENST00000315554.12-1 ENST00000315554.12-1 6 TRUE -1.84845
ENST00000315554.12-1 ENST00000315554.12-1 6 TRUE -1.84845
ENST00000315554.12-1 ENST00000315554.12-1 6 TRUE -1.84845
ENST00000315554.12-1 ENST00000315554.12-1 6 TRUE -1.84845
... ... ... ... ...
ENST00000216879.8 ENST00000216879.8 9 TRUE 1.30549
ENST00000216879.8 ENST00000216879.8 9 FALSE 1.30549
ENST00000451301.5 ENST00000451301.5 3 FALSE -2.08015
ENST00000451301.5 ENST00000451301.5 3 TRUE -2.08015
ENST00000451301.5 ENST00000451301.5 3 FALSE -2.08015
sign symbol
<numeric> <character>
ENST00000315554.12-1 -1 CDC42
ENST00000315554.12-1 -1 CDC42
ENST00000315554.12-1 -1 CDC42
ENST00000315554.12-1 -1 CDC42
ENST00000315554.12-1 -1 CDC42
... ... ...
ENST00000216879.8 1 NSFL1C
ENST00000216879.8 1 NSFL1C
ENST00000451301.5 -1 MORF4L2
ENST00000451301.5 -1 MORF4L2
ENST00000451301.5 -1 MORF4L2
-------
seqinfo: 25 sequences (1 circular) from an unspecified genome; no seqlengths
4.1 Detect skipped-exon events
We are looking to detect exons that were skipped-out due to PTBP1 (down-regulated exons). A transcript with a positive coefficient will be more likely to be expressed when the protein is abundant, and a transcript with a negative coefficient will be less likely to be expressed when the protein is abundant.
Exons that appear in transcripts with negative coefficients and that do not appear in transcripts with positive coefficients may be down-regulated by the protein.
We want to get exons that are present in the KD condition (sign -1) that are not present in the WD condition (sign -1)
library(dplyr)
flat_sig_exons <- flat_sig_exons |>
dplyr::mutate(key = paste0(isoform_id, "-", exon_rank))
# 1) split + vs – coef sign
pos_exons <- flat_sig_exons |> filter(sign == 1)
neg_exons <- flat_sig_exons |> filter(sign == -1)
# 2) filter non overlaps
downreg_candidates <- neg_exons |>
filter_by_non_overlaps_directed(pos_exons) |>
mutate(SE = TRUE) |>
filter(internal == TRUE)
# 3) detect left and right exons from candidates
left_keys <- paste0(downreg_candidates$isoform_id, "-",
downreg_candidates$exon_rank-1)
left_exons <- flat_sig_exons |>
filter(key %in% left_keys)
right_keys <- paste0(downreg_candidates$isoform_id, "-",
downreg_candidates$exon_rank+1)
right_exons <- flat_sig_exons |>
filter(key %in% right_keys)
# 4) filter in candidates with left and right present in pos_exons
downreg_candidates <- downreg_candidates |>
mutate(left_and_right =
left_exons %in% pos_exons &
right_exons %in% pos_exons
)
downreg_exons <- downreg_candidates |> filter(left_and_right == TRUE)
# summary
length(downreg_exons) # n of spliced exons [1] 19
length(unique(downreg_exons$symbol)) # n of dif genes[1] 16
List of genes that had skipped-exon events
downreg_exons$symbol |> unique() [1] "TPM3" "SKP1" "PDLIM7" "SMAP1" "TPM2" "RPS24" "DPF2" "EIF4G2"
[9] "ISCU" "FERMT2" "PKM" "RNPS1" "EXOC7" "TECR" "AXL" "NSFL1C"
4.2 Analyse upstream regions of the down-regulated exons
We know that PTPB1 binds to intronic regions upstream to the skipped exons, regions 15-20 rich in U and C.
With the selected down-regulated exon candidates, we can analyze their upstream region and study their bp composition to look for PTPB1 signals.
Get upstream sequences.
library(Biostrings)
library(BSgenome.Hsapiens.UCSC.hg38)
#get ranges of 100bp upstream region
width_upstream <- 100
upstr_downreg_exons <- downreg_exons |> #GRanges
flank_upstream(width = width_upstream)
# get sequence from GRanges
seq_downreg_exons <- Hsapiens |> ## RNAStringSet object
getSeq(upstr_downreg_exons) |>
RNAStringSet()
seq_downreg_exonsRNAStringSet object of length 19:
width seq names
[1] 100 UCACACUUUCCUAAUUUCUAAUG...UGUGGUUCCUGUGCCUGUCCAG ENST00000328159.8
[2] 100 CUCUGAAAUGAAGCUUUUAAGUA...AUAUGGUCUUGCAUUGCCUCAG ENST00000522855.5
[3] 100 GCCUUGUCCCCGGCUGUUCUGAC...GUCUGUCUUAUUUCCUUCACAG ENST00000359895.6
[4] 100 AUCUCAAAAAUAUACAGGGAUUU...ACUUUUCACCUGUACUCCACAG ENST00000370455.7
[5] 100 CUUCUCUUCUCUCCUCCACCCGU...CCCCGCCACACACCCCCUGCAG ENST00000329305.6
... ... ...
[15] 100 UUGACAUGCCGUUUCUAACCUGU...UGCACUGCCUUUCACUCCGUAG ENST00000607838.5
[16] 100 CUGGGUGCGGGGCCUGGGGGGCA...CUCUGUGUGGCUCUGGUGACAG ENST00000589210.5
[17] 100 CCUGGUCCAAGCACCUGAGUCUC...CUUACUGUUCCUCUGUCCCCAG ENST00000598987.5
[18] 100 CCUCACUCCCUUACCCGUGCCAC...UCUCUGUCCUUUCUUCUCACAG ENST00000301178.8
[19] 100 GCUCCUGGUGCUGUGUCACACCA...AACCUCACGGCUCUGACUUAAG ENST00000476071.5
Create ranges using sliding window of with 10 with 5 bp overlap
window_width <- 10
overlap <- 5
step <- window_width - overlap
n_windows <- ceiling((width_upstream - window_width) / step) + 1
# create GRanges with the defined window ranges
windows <- upstr_downreg_exons |> slide_ranges( width = window_width, # GRanges
step = step) # divide into windows
windows |> as_tibble()# A tibble: 361 × 6
seqnames start end width strand partition
<fct> <int> <int> <int> <fct> <int>
1 chr1 154169384 154169393 10 - 1
2 chr1 154169389 154169398 10 - 1
3 chr1 154169394 154169403 10 - 1
4 chr1 154169399 154169408 10 - 1
5 chr1 154169404 154169413 10 - 1
6 chr1 154169409 154169418 10 - 1
7 chr1 154169414 154169423 10 - 1
8 chr1 154169419 154169428 10 - 1
9 chr1 154169424 154169433 10 - 1
10 chr1 154169429 154169438 10 - 1
# ℹ 351 more rows
Get the sequence for each window
seq_windows <- Hsapiens |>
getSeq(windows) |>
RNAStringSet() # RNAStringSet
# The RNAStringSet result has n_windows * (# of exons) rows.
head(seq_windows)RNAStringSet object of length 6:
width seq names
[1] 10 GCCUGUCCAG ENST00000328159.8
[2] 10 CCUGUGCCUG ENST00000328159.8
[3] 10 UGGUUCCUGU ENST00000328159.8
[4] 10 CCCUGUGGUU ENST00000328159.8
[5] 10 UUCCUCCCUG ENST00000328159.8
[6] 10 CCUCAUUCCU ENST00000328159.8
Calculate the oligonucleotide frequency (A/C/G/U) per window
bp_counts <- oligonucleotideFrequency(seq_windows,
width = 1) #1-mer
windows_bp <- bp_counts / rowSums(bp_counts)
windows_bp |> as_tibble()# A tibble: 361 × 4
A C G U
<dbl> <dbl> <dbl> <dbl>
1 0.1 0.4 0.3 0.2
2 0 0.4 0.3 0.3
3 0 0.2 0.3 0.5
4 0 0.3 0.3 0.4
5 0 0.5 0.1 0.4
6 0.1 0.5 0 0.4
7 0.1 0.5 0.1 0.3
8 0.1 0.5 0.1 0.3
9 0.1 0.5 0 0.4
10 0.3 0.2 0.1 0.4
# ℹ 351 more rows
4.3 Run SPLain app
SPLain is a Shiny app that allows the exploration and visualization of DTU results. You can find the repository at https://github.com/beamimc/SPLain, and follow the installation instructions. Basically, we need to clone the repo and call the app providing the path of the cloned repo.
splain_path <- "your_path/SPLain" # path to dir with the cloned SPLain repo
app_dir <- file.path(splain_path, "app")
# load the SPLain app
source(file.path(app_dir, "app.R"))
# Run the Shiny app with your data and app directory
shiny::runApp(
app(
se = se,
exons = exons,
app_dir = app_dir
)
)