How can we explore transcript expression from long read data…
Let’s load some data…
here() starts at /Users/milove/bioc/tapir
url <- "https://github.com/gandallab/Dev_Brain_IsoSeq/raw/main/data/cp_vz_0.75_min_7_recovery_talon_abundance_filtered.tsv.gz"
filename <- basename (url)
path <- here ("data" ,filename)
if (! file.exists (path)) {
download.file (url, path)
}
Attaching package: 'readr'
The following objects are masked from 'package:testthat':
edition_get, local_edition
raw_abundance_table <- read_delim (path)
── Column specification ────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): annot_gene_id, annot_transcript_id, annot_gene_name, annot_transcript_name, gene_novel...
dbl (28): gene_ID, transcript_ID, n_exons, length, 209_1_VZ, 209_2_VZ, 209_3_VZ, 209_4_VZ, 334_1...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames (raw_abundance_table)
[1] "gene_ID" "transcript_ID" "annot_gene_id" "annot_transcript_id"
[5] "annot_gene_name" "annot_transcript_name" "n_exons" "length"
[9] "gene_novelty" "transcript_novelty" "ISM_subtype" "209_1_VZ"
[13] "209_2_VZ" "209_3_VZ" "209_4_VZ" "334_1_VZ"
[17] "334_2_VZ" "334_3_VZ" "334_4_VZ" "336_1_VZ"
[21] "336_2_VZ" "336_3_VZ" "336_4_VZ" "334_1_CP"
[25] "334_2_CP" "334_3_CP" "334_4_CP" "209_1_CP"
[29] "209_2_CP" "209_3_CP" "209_4_CP" "336_1_CP"
[33] "336_2_CP" "336_3_CP" "336_4_CP"
raw_abundance_table[1 : 5 ,1 : 15 ]
# A tibble: 5 × 15
gene_ID transcript_ID annot_gene_id annot_transcript_id annot_gene_name annot_transcript_name
<dbl> <dbl> <chr> <chr> <chr> <chr>
1 12 21 ENSG00000268903.1… ENST00000494149.2_2 AL627309.6 AL627309.6-201
2 18 34 ENSG00000228463.4 ENST00000424587.2 AP006222.2 AP006222.2-001
3 22 39 ENSG00000224813.2 ENST00000445840.1 RP4-669L17.4 RP4-669L17.4-001
4 32 65 ENSG00000225630.1… ENST00000457540.1_1 MTND2P28 MTND2P28-201
5 33 66 ENSG00000237973.1… ENST00000414273.1_1 MTCO1P12 MTCO1P12-201
# ℹ 9 more variables: n_exons <dbl>, length <dbl>, gene_novelty <chr>, transcript_novelty <chr>,
# ISM_subtype <chr>, `209_1_VZ` <dbl>, `209_2_VZ` <dbl>, `209_3_VZ` <dbl>, `209_4_VZ` <dbl>
Attaching package: 'dplyr'
The following object is masked from 'package:testthat':
matches
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
Attaching package: 'tidyr'
The following object is masked from 'package:testthat':
matches
raw_abundance_table |>
select (contains (c ("VZ" ,"CP" ))) |>
summarize (across (everything (), sum))
# A tibble: 1 × 24
`209_1_VZ` `209_2_VZ` `209_3_VZ` `209_4_VZ` `334_1_VZ` `334_2_VZ` `334_3_VZ` `334_4_VZ` `336_1_VZ`
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 623608 542318 501999 529527 1273843 1594927 1160553 1546123 1328077
# ℹ 15 more variables: `336_2_VZ` <dbl>, `336_3_VZ` <dbl>, `336_4_VZ` <dbl>, `334_1_CP` <dbl>,
# `334_2_CP` <dbl>, `334_3_CP` <dbl>, `334_4_CP` <dbl>, `209_1_CP` <dbl>, `209_2_CP` <dbl>,
# `209_3_CP` <dbl>, `209_4_CP` <dbl>, `336_1_CP` <dbl>, `336_2_CP` <dbl>, `336_3_CP` <dbl>,
# `336_4_CP` <dbl>
library (here)
filename <- "cp_vz_0.75_min_7_recovery_talon_abundance_filtered.tsv.gz"
path <- here ("data" ,filename)
library (readr)
raw_abundance_table <- read_delim (path)
Rows: 214516 Columns: 35
── Column specification ────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): annot_gene_id, annot_transcript_id, annot_gene_name, annot_transcript_name, gene_novel...
dbl (28): gene_ID, transcript_ID, n_exons, length, 209_1_VZ, 209_2_VZ, 209_3_VZ, 209_4_VZ, 334_1...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames (raw_abundance_table)
[1] "gene_ID" "transcript_ID" "annot_gene_id" "annot_transcript_id"
[5] "annot_gene_name" "annot_transcript_name" "n_exons" "length"
[9] "gene_novelty" "transcript_novelty" "ISM_subtype" "209_1_VZ"
[13] "209_2_VZ" "209_3_VZ" "209_4_VZ" "334_1_VZ"
[17] "334_2_VZ" "334_3_VZ" "334_4_VZ" "336_1_VZ"
[21] "336_2_VZ" "336_3_VZ" "336_4_VZ" "334_1_CP"
[25] "334_2_CP" "334_3_CP" "334_4_CP" "209_1_CP"
[29] "209_2_CP" "209_3_CP" "209_4_CP" "336_1_CP"
[33] "336_2_CP" "336_3_CP" "336_4_CP"
library (dplyr)
counts <- raw_abundance_table |>
select (gene_id = annot_gene_id,
feature_id = annot_transcript_id,
contains (c ("VZ" ,"CP" )))
counts <- counts |>
rename_with (.cols = contains (c ("VZ" ,"CP" )),
\(x) paste0 ("s" , x))
library (tidyr)
samples <- tibble (sample_id = colnames (counts)[- c (1 : 2 )]) |>
separate (sample_id, into= c ("unit" ,"rep" ,"condition" ), sep= "_" , remove= FALSE ) |>
mutate_at (c ("rep" ,"condition" ), factor)