library(here)
# https://drive.google.com/file/d/1ATHgwFlIMmI651C5TYtiBxm2atTvbBYn/view?usp=sharing
# counts
url <- "https://drive.google.com/uc?export=download&id=1ATHgwFlIMmI651C5TYtiBxm2atTvbBYn"
filename <- "quantification_flair_filter.counts.txt.gz"
path <- here("data", filename)
if (!file.exists(path)) {
download.file(url, path)
}
# GTF
url <- "https://drive.google.com/uc?export=download&id=1S0TRXoXsfnXwJ0Dh5bLqVmyvFAAALFMU"
filename <- "flair_filter_transcripts.gtf.gz"
path <- here("data", filename)
if (!file.exists(path)) {
download.file(url, path)
}
# metadata
url <- "https://drive.google.com/uc?export=download&id=1LpYPXXhMtUV3vMG7SMLV69-JxZfc59o2"
filename <- "glinos_metadata.csv"
path <- here("data", filename)
if (!file.exists(path)) {
download.file(url, path)
}3 Statistical testing
In this chapter we show differential gene expression (DGE) using DESeq2 and differential transcript usage (DTU) using satuRn.
Most of the code here involves preparing the R objects for the methods. This is a bit complicated because we are importing data from files that were associated with a paper. I (Mike) would like to instead quantify this data with oarfish (with a current GENCODE + some novel transcripts) so we can demonstrate how easy it is to import with tximeta and use in R/Bioconductor pipelines. For now there are a lot of chunks that deal with data import manually.
Downloading Glinos et al. dataset (GTEx, PTBP1 knockdown)
Assembling the object manually from downloaded files.
library(readr)
library(dplyr)
library(stringr)
counts <- read_delim(here("data", "quantification_flair_filter.counts.txt.gz"))
meta <- read_delim(here("data", "glinos_metadata.csv"))
colnames(meta)[10] <- "total_reads"
meta <- meta |>
mutate(condition = str_extract(sample_id, "(ctrl|exp)")) |>
select(sample_id, sample_name, condition, contains("read") | contains("3_prime")) |>
arrange(condition, sample_name)
rnames <- counts$transcript
counts <- as.matrix(counts[,meta$sample_id]) # subset to the knockdown
rownames(counts) <- rnameslibrary(SummarizedExperiment)
se <- SummarizedExperiment(list(counts = counts),
colData = meta)
#saveRDS(se, file=here("data", "glinos_se.rds"))Filtering for count of 10 across 6 or more samples.
library(here)
library(SummarizedExperiment)
se <- readRDS(here("data", "glinos_se.rds"))
keep <- rowSums(assay(se) >= 10) >= 6
table(keep)keep
FALSE TRUE
77437 16193
se <- se[keep,]Manual data processing:
library(DESeq2)
library(tximeta)
colsums <- colSums(assay(se, "counts"))
cpm <- t(t(assay(se, "counts")) / colsums * 1e6)
assay(se, "abundance") <- cpm
# dummy lengths... no correction for length
assay(se, "length", withDimnames=FALSE) <- matrix(1000, nrow=nrow(se), ncol=ncol(se))library(dplyr)
se$condition <- case_when(
se$condition == "ctrl" ~ "WT",
se$condition == "exp" ~ "KD"
)
# note that protein knockdown will be used as reference
# with this choice of factor levels
se$condition <- factor(se$condition, c("KD","WT"))
se$condition [1] WT WT WT WT WT WT KD KD KD KD KD KD KD
Levels: KD WT
Load a TxDb and produce some objects telling us about the transcripts.
library(stringr)
library(GenomicFeatures)
#txdb <- makeTxDbFromGFF(here("data","flair_filter_transcripts.gtf.gz"))
#saveDb(txdb, file=here("data","flair_filter_transcripts.sqlite"))
txdb <- loadDb(here("data","flair_filter_transcripts.sqlite"))
exons <- exonsBy(txdb, by="tx")
txps <- AnnotationDbi::select(txdb, keys(txdb, "TXID"), c("TXNAME","GENEID"), "TXID") |>
tibble::as_tibble() |>
dplyr::mutate(TXID = as.character(TXID))
# better names for exons:
length(exons)[1] 93718
all.equal(names(exons), txps$TXID)[1] TRUE
txps <- txps |>
dplyr::mutate(new_name = case_when(
str_detect(TXNAME,"ENST") ~ TXNAME,
TRUE ~ paste0(GENEID, "-", str_sub(TXNAME, 1, 8))
))
names(exons) <- txps$new_name
#saveRDS(exons, file=here("data","glinos_exons.rds"))Summarize to gene level:
# subsetting to only txp/gene with complete metadata
# and renaming to have more legible names
rownames(se) <- str_replace(rownames(se),"_ENSG\\d+\\.\\d+$","")
table(rownames(se) %in% txps$TXNAME) # these appear to be not genes
FALSE TRUE
933 15260
txps <- txps |>
filter(TXNAME %in% rownames(se))
se <- se[txps$TXNAME,]
rownames(se) <- txps$new_name
rowData(se)$isoform_id <- txps$new_name
rowData(se)$gene_id <- txps$GENEID
tx2gene <- rowData(se)[,c("isoform_id","gene_id")]
gse <- summarizeToGene(se, tx2gene = tx2gene, skipRanges=TRUE)summarizing abundance
summarizing counts
summarizing length
#saveRDS(gse, file=here("data","glinos_gene_counts.rds"))DESeq2 DGE:
dds <- DESeqDataSet(gse, ~condition)converting counts to integer mode
keep <- rowSums(counts(dds) >= 10) >= 6
table(keep)keep
TRUE
8780
dds <- DESeq(dds, minRep=Inf)estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
res <- results(dds, cooksCutoff=FALSE)
summary(res)
out of 8780 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 1499, 17%
LFC < 0 (down) : 1469, 17%
outliers [1] : 0, 0%
low counts [2] : 0, 0%
(mean count < 7)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Add gene symbols manually, make a simple plot of PTBP1 (the knocked-down gene).
library(org.Hs.eg.db)
gene_anno <- DataFrame(gene_id = str_replace(rownames(dds),"\\.\\d+",""),
baseMean = round(rowData(dds)$baseMean))
gene_anno$symbol <- org.Hs.eg.db |> mapIds(gene_anno$gene_id, "SYMBOL", "ENSEMBL")'select()' returned 1:many mapping between keys and columns
gene_anno$symbol[is.na(gene_anno$symbol)] <- gene_anno$gene_id[is.na(gene_anno$symbol)]
rowData(dds)$symbol <- gene_anno$symbol
idx <- which(rowData(dds)$symbol == "PTBP1")
set.seed(5) # for jittered plots
plotCounts(dds, gene=idx)
PCA plot:
vsd <- vst(dds, blind=FALSE)
plotPCA(vsd)using ntop=500 top features by variance

#saveRDS(dds, file=here("data", "glinos_deseq2_gene.rds"))
#saveRDS(vsd, file=here("data", "glinos_vst_gene.rds"))Make a Glimma report:
library(Glimma)
g <- glimmaMA(
dds,
groups=dds$condition,
anno=gene_anno[,-1],
main="Glinos et al. (2022) - PTBP1 knockdown", # title above left summary plot
transform.counts="cpm",
test="Wald", name="condition_WT_vs_KD",
independentFiltering=FALSE,
cooksCutoff=FALSE
)
htmlwidgets::saveWidget(g, here("data","glinos_ma-plot.html"))Now, let’s look for DTU, first we do some processing of the SummarizedExperiment with plyxp.
library(plyxp)
xp <- se |>
new_plyxp()
xp <- xp |>
mutate(rows(gene = str_replace(gene_id, "\\.\\d+", ""),
symbol = mapIds(org.Hs.eg.db, gene, "SYMBOL", "ENSEMBL"),
symbol = dplyr::case_when(
is.na(symbol) ~ gene,
TRUE ~ symbol
)))
tab <- table(rowData(se)$gene_id)
one_isoform_gene <- names(tab)[ tab == 1 ]
xp <- xp |>
filter(rows(!gene_id %in% one_isoform_gene)) |>
arrange(rows(gene_id))head(table(table(rowData(xp)$gene_id))) # confirm no single isoform genes
2 3 4 5 6 7
1957 780 305 111 73 40
Extract the SE after we are done with plyxp filtering.
se <- se(xp)Run satuRn for DTU, first fit the model:
# 5 seconds
library(satuRn)
se <- satuRn::fitDTU(se, formula=~0 + condition)Below we compare WT (wildtype) to KD (knockdown), which is perhaps the reverse of how one might expect to define the comparison. Typically wildtype or control is used as the reference that the experimental condition is compared to. In this case, we use the knockdown as the reference, as it provides us with log2 fold changes with respect to the activity of the protein. For example, a transcript with a positive coefficient will be more likely to be expressed when the protein is abundant, and a transcript with a negative coefficient will be less likely to be expressed when the protein is abundant. Exons that appear in transcripts with negative coefficients may be down-regulated by the protein.
design <- model.matrix(~0 + condition, colData(se))
colnames(design)[1] "conditionKD" "conditionWT"
colnames(design) <- levels(se$condition)
L <- limma::makeContrasts(
WT_vs_KD = WT - KD,
levels = design
)
L # contrast matrix Contrasts
Levels WT_vs_KD
KD -1
WT 1
se <- satuRn::testDTU(
object = se,
contrasts = L,
diagplot1 = TRUE,
diagplot2 = TRUE
)

#saveRDS(se, file=here("data","glinos_saturn_dtu.rds"))Extract test results:
sig_res <- rowData(se)[["fitDTUResult_WT_vs_KD"]] |>
tibble::as_tibble() |>
dplyr::bind_cols(as.data.frame(rowData(se)[,1:4])) |>
dplyr::filter(empirical_FDR < .1) |>
dplyr::select(gene_id, isoform_id, symbol, estimates, empirical_pval, empirical_FDR) |>
dplyr::arrange(empirical_pval)
#write.csv(sig_res, file=here("data","glinos_saturn_dtu.csv"), quote=FALSE, row.names=FALSE)
sig_res# A tibble: 61 × 6
gene_id isoform_id symbol estimates empirical_pval empirical_FDR
<chr> <chr> <chr> <dbl> <dbl> <dbl>
1 ENSG00000196923.13 ENST00000355841.6 PDLIM7 5.14 0.000000309 0.00304
2 ENSG00000198467.13 ENST00000378292.7 TPM2 2.93 0.00000324 0.0108
3 ENSG00000138674.16 ENSG00000138674.16-d16d19d1 SEC31A 2.41 0.00000330 0.0108
4 ENSG00000196923.13 ENSG00000196923.13-00a9e7dd PDLIM7 -4.92 0.00000854 0.0210
5 ENSG00000198467.13 ENSG00000198467.13-7a434697 TPM2 -3.27 0.0000170 0.0252
6 ENSG00000161203.13 ENSG00000161203.13-29f67f36 AP2M1 -1.51 0.0000172 0.0252
7 ENSG00000134285.10 ENST00000552878.5-1 FKBP11 -1.98 0.0000179 0.0252
8 ENSG00000134285.10 ENST00000550765.5 FKBP11 1.98 0.0000211 0.0259
9 ENSG00000167460.14 ENST00000300933.8 TPM4 1.35 0.0000325 0.0304
10 ENSG00000161203.13 ENST00000382456.7 AP2M1 1.83 0.0000365 0.0304
# ℹ 51 more rows
As a demonstration of DTU, look at counts for the top gene:
cpm <- assay(se, "abundance")[rowData(se)$symbol == "PDLIM7",]
rownames(cpm) <- seq_len(nrow(cpm))
colnames(cpm) <- se$condition
round(cpm) WT WT WT WT WT WT KD KD KD KD KD KD KD
1 1 1 1 2 1 0 72 45 37 36 38 35 55
2 1 2 2 2 1 1 295 190 150 146 175 183 236
3 124 105 114 94 104 84 15 9 14 11 9 10 10
4 4 2 4 2 4 1 19 9 15 13 11 12 7
5 7 5 8 10 9 7 8 5 6 4 4 5 5
6 0 0 1 0 2 0 6 7 6 6 5 8 5
sessionInfo()R version 4.5.0 (2025-04-11)
Platform: aarch64-apple-darwin20
Running under: macOS Sequoia 15.5
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/New_York
tzcode source: internal
attached base packages:
[1] stats4 stats graphics grDevices datasets utils methods base
other attached packages:
[1] satuRn_1.16.0 plyxp_1.2.0 org.Hs.eg.db_3.21.0
[4] GenomicFeatures_1.60.0 AnnotationDbi_1.70.0 stringr_1.5.1
[7] dplyr_1.1.4 tximeta_1.26.1 DESeq2_1.48.1
[10] SummarizedExperiment_1.38.0 Biobase_2.68.0 GenomicRanges_1.60.0
[13] GenomeInfoDb_1.44.0 IRanges_2.42.0 S4Vectors_0.46.0
[16] BiocGenerics_0.54.0 generics_0.1.3 MatrixGenerics_1.20.0
[19] matrixStats_1.5.0 here_1.0.1 rmarkdown_2.29
[22] devtools_2.4.5 usethis_3.1.0
loaded via a namespace (and not attached):
[1] RColorBrewer_1.1-3 jsonlite_2.0.0 tximport_1.36.0
[4] magrittr_2.0.3 farver_2.1.2 fs_1.6.6
[7] BiocIO_1.18.0 vctrs_0.6.5 locfdr_1.1-8
[10] memoise_2.0.1 Rsamtools_2.24.0 RCurl_1.98-1.17
[13] htmltools_0.5.8.1 S4Arrays_1.7.2 progress_1.2.3
[16] AnnotationHub_3.16.0 curl_6.2.2 SparseArray_1.8.0
[19] htmlwidgets_1.6.4 httr2_1.1.2 cachem_1.1.0
[22] GenomicAlignments_1.44.0 mime_0.13 lifecycle_1.0.4
[25] pkgconfig_2.0.3 Matrix_1.7-3 R6_2.6.1
[28] fastmap_1.2.0 GenomeInfoDbData_1.2.14 shiny_1.10.0
[31] digest_0.6.37 rprojroot_2.0.4 pkgload_1.4.0
[34] RSQLite_2.3.11 labeling_0.4.3 filelock_1.0.3
[37] httr_1.4.7 abind_1.4-8 compiler_4.5.0
[40] remotes_2.5.0 withr_3.0.2 bit64_4.6.0-1
[43] S7_0.2.0 BiocParallel_1.42.0 DBI_1.2.3
[46] pkgbuild_1.4.7 biomaRt_2.64.0 rappdirs_0.3.3
[49] DelayedArray_0.34.1 sessioninfo_1.2.3 rjson_0.2.23
[52] tools_4.5.0 httpuv_1.6.16 glue_1.8.0
[55] restfulr_0.0.15 promises_1.3.2 grid_4.5.0
[58] gtable_0.3.6 tidyr_1.3.1 ensembldb_2.32.0
[61] hms_1.1.3 utf8_1.2.4 xml2_1.3.8
[64] XVector_0.48.0 BiocVersion_3.21.1 pillar_1.10.2
[67] limma_3.64.1 later_1.4.2 splines_4.5.0
[70] BiocFileCache_2.16.0 lattice_0.22-7 rtracklayer_1.68.0
[73] bit_4.6.0 tidyselect_1.2.1 locfit_1.5-9.12
[76] pbapply_1.7-2 Biostrings_2.76.0 miniUI_0.1.2
[79] knitr_1.50 ProtGenerics_1.40.0 xfun_0.52
[82] statmod_1.5.0 stringi_1.8.7 UCSC.utils_1.4.0
[85] boot_1.3-31 lazyeval_0.2.2 yaml_2.3.10
[88] evaluate_1.0.3 codetools_0.2-20 tibble_3.2.1
[91] BiocManager_1.30.25 cli_3.6.5 xtable_1.8-4
[94] Rcpp_1.0.14 dbplyr_2.5.0 png_0.1-8
[97] XML_3.99-0.18 parallel_4.5.0 ellipsis_0.3.2
[100] ggplot2_3.5.2 blob_1.2.4 prettyunits_1.2.0
[103] profvis_0.4.0 AnnotationFilter_1.32.0 urlchecker_1.0.1
[106] bitops_1.0-9 txdbmaker_1.4.1 scales_1.4.0
[109] purrr_1.0.4 crayon_1.5.3 rlang_1.1.6
[112] KEGGREST_1.48.0