2 Exploratory data analysis

How can we explore transcript expression from long read data…

Let’s load some data…

library(here)

here() starts at /Users/milove/bioc/tapir

url <- "https://github.com/gandallab/Dev_Brain_IsoSeq/raw/main/data/cp_vz_0.75_min_7_recovery_talon_abundance_filtered.tsv.gz"
filename <- basename(url)
path <- here("data",filename)
if (!file.exists(path)) {
  download.file(url, path)
}

library(readr)


Attaching package: 'readr'

The following objects are masked from 'package:testthat':

    edition_get, local_edition

raw_abundance_table <- read_delim(path)

Rows: 214516 Columns: 35

── Column specification ────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (7): annot_gene_id, annot_transcript_id, annot_gene_name, annot_transcript_name, gene_novel...
dbl (28): gene_ID, transcript_ID, n_exons, length, 209_1_VZ, 209_2_VZ, 209_3_VZ, 209_4_VZ, 334_1...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

dim(raw_abundance_table)

[1] 214516     35

colnames(raw_abundance_table)

 [1] "gene_ID"               "transcript_ID"         "annot_gene_id"         "annot_transcript_id"  
 [5] "annot_gene_name"       "annot_transcript_name" "n_exons"               "length"               
 [9] "gene_novelty"          "transcript_novelty"    "ISM_subtype"           "209_1_VZ"             
[13] "209_2_VZ"              "209_3_VZ"              "209_4_VZ"              "334_1_VZ"             
[17] "334_2_VZ"              "334_3_VZ"              "334_4_VZ"              "336_1_VZ"             
[21] "336_2_VZ"              "336_3_VZ"              "336_4_VZ"              "334_1_CP"             
[25] "334_2_CP"              "334_3_CP"              "334_4_CP"              "209_1_CP"             
[29] "209_2_CP"              "209_3_CP"              "209_4_CP"              "336_1_CP"             
[33] "336_2_CP"              "336_3_CP"              "336_4_CP"

raw_abundance_table[1:5,1:15]

# A tibble: 5 × 15
  gene_ID transcript_ID annot_gene_id      annot_transcript_id annot_gene_name annot_transcript_name
    <dbl>         <dbl> <chr>              <chr>               <chr>           <chr>                
1      12            21 ENSG00000268903.1… ENST00000494149.2_2 AL627309.6      AL627309.6-201       
2      18            34 ENSG00000228463.4  ENST00000424587.2   AP006222.2      AP006222.2-001       
3      22            39 ENSG00000224813.2  ENST00000445840.1   RP4-669L17.4    RP4-669L17.4-001     
4      32            65 ENSG00000225630.1… ENST00000457540.1_1 MTND2P28        MTND2P28-201         
5      33            66 ENSG00000237973.1… ENST00000414273.1_1 MTCO1P12        MTCO1P12-201         
# ℹ 9 more variables: n_exons <dbl>, length <dbl>, gene_novelty <chr>, transcript_novelty <chr>,
#   ISM_subtype <chr>, `209_1_VZ` <dbl>, `209_2_VZ` <dbl>, `209_3_VZ` <dbl>, `209_4_VZ` <dbl>

library(dplyr)


Attaching package: 'dplyr'

The following object is masked from 'package:testthat':

    matches

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(tidyr)


Attaching package: 'tidyr'

The following object is masked from 'package:testthat':

    matches

raw_abundance_table |>
  select(contains(c("VZ","CP"))) |>
  summarize(across(everything(), sum))

# A tibble: 1 × 24
  `209_1_VZ` `209_2_VZ` `209_3_VZ` `209_4_VZ` `334_1_VZ` `334_2_VZ` `334_3_VZ` `334_4_VZ` `336_1_VZ`
       <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
1     623608     542318     501999     529527    1273843    1594927    1160553    1546123    1328077
# ℹ 15 more variables: `336_2_VZ` <dbl>, `336_3_VZ` <dbl>, `336_4_VZ` <dbl>, `334_1_CP` <dbl>,
#   `334_2_CP` <dbl>, `334_3_CP` <dbl>, `334_4_CP` <dbl>, `209_1_CP` <dbl>, `209_2_CP` <dbl>,
#   `209_3_CP` <dbl>, `209_4_CP` <dbl>, `336_1_CP` <dbl>, `336_2_CP` <dbl>, `336_3_CP` <dbl>,
#   `336_4_CP` <dbl>

library(here)
filename <- "cp_vz_0.75_min_7_recovery_talon_abundance_filtered.tsv.gz"
path <- here("data",filename)
library(readr)
raw_abundance_table <- read_delim(path)

Rows: 214516 Columns: 35
── Column specification ────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (7): annot_gene_id, annot_transcript_id, annot_gene_name, annot_transcript_name, gene_novel...
dbl (28): gene_ID, transcript_ID, n_exons, length, 209_1_VZ, 209_2_VZ, 209_3_VZ, 209_4_VZ, 334_1...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

dim(raw_abundance_table)

[1] 214516     35

colnames(raw_abundance_table)

 [1] "gene_ID"               "transcript_ID"         "annot_gene_id"         "annot_transcript_id"  
 [5] "annot_gene_name"       "annot_transcript_name" "n_exons"               "length"               
 [9] "gene_novelty"          "transcript_novelty"    "ISM_subtype"           "209_1_VZ"             
[13] "209_2_VZ"              "209_3_VZ"              "209_4_VZ"              "334_1_VZ"             
[17] "334_2_VZ"              "334_3_VZ"              "334_4_VZ"              "336_1_VZ"             
[21] "336_2_VZ"              "336_3_VZ"              "336_4_VZ"              "334_1_CP"             
[25] "334_2_CP"              "334_3_CP"              "334_4_CP"              "209_1_CP"             
[29] "209_2_CP"              "209_3_CP"              "209_4_CP"              "336_1_CP"             
[33] "336_2_CP"              "336_3_CP"              "336_4_CP"

library(dplyr)
counts <- raw_abundance_table |>
  select(gene_id = annot_gene_id,
         feature_id = annot_transcript_id,
         contains(c("VZ","CP")))
counts <- counts |>
  rename_with(.cols = contains(c("VZ","CP")),
              \(x) paste0("s", x))
library(tidyr)
samples <- tibble(sample_id = colnames(counts)[-c(1:2)]) |>
  separate(sample_id, into=c("unit","rep","condition"), sep="_", remove=FALSE) |>
  mutate_at(c("rep","condition"), factor)