Differential expression from RNA-seq

dir <- system.file("extdata", package="macrophage")
library(tximeta)
makeLinkedTxome(
  indexDir=file.path(dir, "gencode.v29_salmon_0.12.0"),
  source="Gencode",
  organism="Homo sapiens",
  release="29",
  genome="GRCh38",
  fasta="ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.transcripts.fa.gz",
  gtf=file.path(dir, "gencode.v29.annotation.gtf.gz"), # local version
  write=FALSE
)
dir <- system.file("extdata", package="macrophage")
library(dplyr)
## 
## Caricamento pacchetto: 'dplyr'
## I seguenti oggetti sono mascherati da 'package:stats':
## 
##     filter, lag
## I seguenti oggetti sono mascherati da 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
colfile <- file.path(dir, "coldata.csv")
coldata <- read_csv(colfile) |>
  dplyr::select(
    names,
    id = sample_id,
    line = line_id,
    condition = condition_name
  ) |>
  dplyr::mutate(
    files = file.path(dir, "quants", names, "quant.sf.gz"),
    line = factor(line),
    condition = relevel(factor(condition), "naive")
  )
## Rows: 24 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): names, sample_id, line_id, condition_name, macrophage_harvest, sal...
## dbl  (3): replicate, ng_ul_mean, rna_auto
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
coldata
## # A tibble: 24 × 5
##    names          id     line   condition   files                               
##    <chr>          <chr>  <fct>  <fct>       <chr>                               
##  1 SAMEA103885102 diku_A diku_1 naive       C:/Users/cason/AppData/Local/R/win-…
##  2 SAMEA103885347 diku_B diku_1 IFNg        C:/Users/cason/AppData/Local/R/win-…
##  3 SAMEA103885043 diku_C diku_1 SL1344      C:/Users/cason/AppData/Local/R/win-…
##  4 SAMEA103885392 diku_D diku_1 IFNg_SL1344 C:/Users/cason/AppData/Local/R/win-…
##  5 SAMEA103885182 eiwy_A eiwy_1 naive       C:/Users/cason/AppData/Local/R/win-…
##  6 SAMEA103885136 eiwy_B eiwy_1 IFNg        C:/Users/cason/AppData/Local/R/win-…
##  7 SAMEA103885413 eiwy_C eiwy_1 SL1344      C:/Users/cason/AppData/Local/R/win-…
##  8 SAMEA103884967 eiwy_D eiwy_1 IFNg_SL1344 C:/Users/cason/AppData/Local/R/win-…
##  9 SAMEA103885368 fikt_A fikt_3 naive       C:/Users/cason/AppData/Local/R/win-…
## 10 SAMEA103885218 fikt_B fikt_3 IFNg        C:/Users/cason/AppData/Local/R/win-…
## # ℹ 14 more rows
library(SummarizedExperiment)
## Caricamento del pacchetto richiesto: MatrixGenerics
## Caricamento del pacchetto richiesto: matrixStats
## 
## Caricamento pacchetto: 'matrixStats'
## Il seguente oggetto è mascherato da 'package:dplyr':
## 
##     count
## 
## Caricamento pacchetto: 'MatrixGenerics'
## I seguenti oggetti sono mascherati da 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Caricamento del pacchetto richiesto: GenomicRanges
## Caricamento del pacchetto richiesto: stats4
## Caricamento del pacchetto richiesto: BiocGenerics
## Caricamento del pacchetto richiesto: generics
## 
## Caricamento pacchetto: 'generics'
## Il seguente oggetto è mascherato da 'package:dplyr':
## 
##     explain
## I seguenti oggetti sono mascherati da 'package:base':
## 
##     as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
##     setequal, union
## 
## Caricamento pacchetto: 'BiocGenerics'
## Il seguente oggetto è mascherato da 'package:dplyr':
## 
##     combine
## I seguenti oggetti sono mascherati da 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## I seguenti oggetti sono mascherati da 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
##     unsplit, which.max, which.min
## Caricamento del pacchetto richiesto: S4Vectors
## 
## Caricamento pacchetto: 'S4Vectors'
## I seguenti oggetti sono mascherati da 'package:dplyr':
## 
##     first, rename
## Il seguente oggetto è mascherato da 'package:utils':
## 
##     findMatches
## I seguenti oggetti sono mascherati da 'package:base':
## 
##     expand.grid, I, unname
## Caricamento del pacchetto richiesto: IRanges
## 
## Caricamento pacchetto: 'IRanges'
## I seguenti oggetti sono mascherati da 'package:dplyr':
## 
##     collapse, desc, slice
## Il seguente oggetto è mascherato da 'package:grDevices':
## 
##     windows
## Caricamento del pacchetto richiesto: GenomeInfoDb
## Caricamento del pacchetto richiesto: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Caricamento pacchetto: 'Biobase'
## Il seguente oggetto è mascherato da 'package:MatrixGenerics':
## 
##     rowMedians
## I seguenti oggetti sono mascherati da 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(tximeta)
se <- tximeta(coldata, dropInfReps=TRUE, useHub=FALSE, skipSeqinfo=TRUE)
## importing quantifications
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
## found matching linked transcriptome:
## [ GENCODE - Homo sapiens - release 29 ]
## loading existing TxDb created: 2025-07-08 11:34:32
## Caricamento del pacchetto richiesto: GenomicFeatures
## Caricamento del pacchetto richiesto: AnnotationDbi
## 
## Caricamento pacchetto: 'AnnotationDbi'
## 
## Il seguente oggetto è mascherato da 'package:dplyr':
## 
##     select
## 
## loading existing transcript ranges created: 2025-07-08 11:34:33
gse <- summarizeToGene(se, assignRanges="abundant")
## loading existing TxDb created: 2025-07-08 11:34:32
## obtaining transcript-to-gene mapping from database
## loading existing gene ranges created: 2025-07-08 11:34:45
## assignRanges='abundant': gene ranges assigned by isoform abundance
##   see details at: ?summarizeToGene,SummarizedExperiment-method
## summarizing abundance
## summarizing counts
## summarizing length
library(DESeq2)
dds <- DESeqDataSet(gse, ~line + condition)
## using counts and average transcript lengths from tximeta
dds <- dds[,dds$condition %in% c("naive","IFNg")]
dds$condition <- droplevels(dds$condition)
dds$condition <- relevel(dds$condition, "naive")
keep <- rowSums(counts(dds) >= 10) >= 6
dds <- dds[keep,]

The model is fit with the following line of code:

dds <- DESeq(dds)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds, lfcThreshold=1)
summary(res)
## 
## out of 16086 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 1.00 (up)    : 793, 4.9%
## LFC < -1.00 (down) : 523, 3.3%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 7)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

To see the results of the expression analysis, we can generate a summary table and an MA plot:

DESeq2::plotMA(res, ylim=c(-10,10))
library(plyranges)
## 
## Caricamento pacchetto: 'plyranges'
## Il seguente oggetto è mascherato da 'package:AnnotationDbi':
## 
##     select
## Il seguente oggetto è mascherato da 'package:IRanges':
## 
##     slice
## I seguenti oggetti sono mascherati da 'package:dplyr':
## 
##     between, n, n_distinct
## Il seguente oggetto è mascherato da 'package:stats':
## 
##     filter
all_genes <- results(dds, lfcThreshold=1, format="GRanges") |>
  names_to_column("gene_id") |>
  select(gene_id, de_log2FC = log2FoldChange, de_padj = padj, de_pval = pvalue)
genome(all_genes) <- "hg38"
si <- Seqinfo(genome="hg38")
#save(Seqinfo, file="seqinfo.rda")
#load("seqinfo.rda")
si <- keepStandardChromosomes(si)
seqinfo(all_genes) <- si

Differential accessibility from ATAC-seq

The following section describes the process we have used for generating a GRanges object of differential peaks from the ATAC-seq data in @alasoo. The code chunks for the remainder of this section are not run. For assessing differential accessibility, we followed the original paper, using limma [@Smyth2004], and generating a summary of LFCs and adjusted p-values for the peaks.

library(fluentGenomics)
# atac <- readRDS(cache_atac_se())
library(limma)
design <- model.matrix(~donor + condition, colData(atac))
fit <- lmFit(assay(atac), design)
fit <- eBayes(fit)
idx <- which(colnames(fit$coefficients) == "conditionIFNg")
tt <- topTable(fit, coef=idx, sort.by="none", n=nrow(atac))
atac_peaks <- rowRanges(atac) |>
  remove_names() |>
  mutate(
    da_log2FC = tt$logFC,
    da_padj = tt$adj.P.Val
  ) |>
  set_genome_info(genome = "hg38")
seqlevelsStyle(atac_peaks) <- "UCSC"

The final GRanges object containing the DA peaks is included in the workflow package and can be loaded as follows:

library(fluentGenomics)
peaks
## GRanges object with 296220 ranges and 3 metadata columns:
##            seqnames              ranges strand |          peak_id da_log2FC
##               <Rle>           <IRanges>  <Rle> |      <character> <numeric>
##        [1]     chr1          9979-10668      * |      ATAC_peak_1  0.266185
##        [2]     chr1         10939-11473      * |      ATAC_peak_2  0.322177
##        [3]     chr1         15505-15729      * |      ATAC_peak_3 -0.574160
##        [4]     chr1         21148-21481      * |      ATAC_peak_4 -1.147066
##        [5]     chr1         21864-22067      * |      ATAC_peak_5 -0.896143
##        ...      ...                 ...    ... .              ...       ...
##   [296216]     chrX 155896572-155896835      * | ATAC_peak_296216 -0.834629
##   [296217]     chrX 155958507-155958646      * | ATAC_peak_296217 -0.147537
##   [296218]     chrX 156016760-156016975      * | ATAC_peak_296218 -0.609732
##   [296219]     chrX 156028551-156029422      * | ATAC_peak_296219 -0.347678
##   [296220]     chrX 156030135-156030785      * | ATAC_peak_296220  0.492442
##                da_padj
##              <numeric>
##        [1] 9.10673e-05
##        [2] 2.03435e-05
##        [3] 3.41708e-08
##        [4] 8.22299e-26
##        [5] 4.79453e-11
##        ...         ...
##   [296216] 1.33546e-11
##   [296217] 3.13015e-01
##   [296218] 3.62339e-09
##   [296219] 6.94823e-06
##   [296220] 7.07664e-13
##   -------
##   seqinfo: 23 sequences from hg38 genome; no seqlengths
seqlevels(peaks) <- seqlevels(si)
seqinfo(peaks) <- si

Integration of RNA-seq and ATAC-seq differential results

da_peaks <- peaks |>
  filter(da_padj < .01, abs(da_log2FC) > .5)
tss_by_de <- all_genes |>
  mutate(de_sig =
           case_when(
             de_padj <= .01 ~ "de",
             TRUE ~ "non-de"
         )) |>
  filter(!dplyr::between(de_padj, .01, .99)) |>
  anchor_5p() |>
  mutate(width=1)
dist_res <- tss_by_de |>
  add_nearest_distance(da_peaks)
dist_res_clean <- dist_res |>
  as_tibble() |>
  tidyr::drop_na()
dist_res_clean |>
  group_by(de_sig) |>
  summarize(mean = mean(distance), sd = sd(distance))
## # A tibble: 2 × 3
##   de_sig  mean     sd
##   <chr>  <dbl>  <dbl>
## 1 de     3173.  5927.
## 2 non-de 7705. 13266.
library(ggplot2)
dist_res_clean |>
  filter(distance < 1e6) |>
  mutate(log10_dist = log10(distance + 1)) |>
  ggplot(aes(log10_dist, color=de_sig)) +
  geom_density()

dist_res %>%
  mutate(near_peaks = count_overlaps(., da_peaks, maxgap=100)) |>
  as_tibble() |> 
  dplyr::count(de_sig, near_peaks)
## # A tibble: 6 × 3
##   de_sig near_peaks     n
##   <chr>       <int> <int>
## 1 de              0   664
## 2 de              1   338
## 3 de              2     4
## 4 non-de          0 12840
## 5 non-de          1  1280
## 6 non-de          2    28
library(nullranges)
da_peaks_chr <- da_peaks |>
  dropSeqlevels(c("chrY","chrM")) |>
  sort()
set.seed(1)
boot <- bootRanges(da_peaks_chr, blockLength=1e6, R=5)
all_peaks <- bind_ranges(
  da_peaks %>% mutate(iter=0),
  boot
)
tss_by_de |>
  join_overlap_inner(all_peaks, maxgap=100) |>
  as_tibble() |>
  select(gene_id, peak_id, de_sig, iter) |>
  group_by(de_sig, iter) |>
  summarize(overlaps_any = n_distinct(gene_id))
## `summarise()` has grouped output by 'de_sig'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 3
## # Groups:   de_sig [2]
##    de_sig iter  overlaps_any
##    <chr>  <fct>        <int>
##  1 de     0              342
##  2 de     1               33
##  3 de     2               24
##  4 de     3               20
##  5 de     4               19
##  6 de     5               33
##  7 non-de 0             1308
##  8 non-de 1              407
##  9 non-de 2              390
## 10 non-de 3              369
## 11 non-de 4              373
## 12 non-de 5              378