dir <- system.file("extdata", package="macrophage")
library(tximeta)
makeLinkedTxome(
indexDir=file.path(dir, "gencode.v29_salmon_0.12.0"),
source="Gencode",
organism="Homo sapiens",
release="29",
genome="GRCh38",
fasta="ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.transcripts.fa.gz",
gtf=file.path(dir, "gencode.v29.annotation.gtf.gz"), # local version
write=FALSE
)
dir <- system.file("extdata", package="macrophage")
library(dplyr)
##
## Caricamento pacchetto: 'dplyr'
## I seguenti oggetti sono mascherati da 'package:stats':
##
## filter, lag
## I seguenti oggetti sono mascherati da 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
colfile <- file.path(dir, "coldata.csv")
coldata <- read_csv(colfile) |>
dplyr::select(
names,
id = sample_id,
line = line_id,
condition = condition_name
) |>
dplyr::mutate(
files = file.path(dir, "quants", names, "quant.sf.gz"),
line = factor(line),
condition = relevel(factor(condition), "naive")
)
## Rows: 24 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): names, sample_id, line_id, condition_name, macrophage_harvest, sal...
## dbl (3): replicate, ng_ul_mean, rna_auto
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
coldata
## # A tibble: 24 × 5
## names id line condition files
## <chr> <chr> <fct> <fct> <chr>
## 1 SAMEA103885102 diku_A diku_1 naive C:/Users/cason/AppData/Local/R/win-…
## 2 SAMEA103885347 diku_B diku_1 IFNg C:/Users/cason/AppData/Local/R/win-…
## 3 SAMEA103885043 diku_C diku_1 SL1344 C:/Users/cason/AppData/Local/R/win-…
## 4 SAMEA103885392 diku_D diku_1 IFNg_SL1344 C:/Users/cason/AppData/Local/R/win-…
## 5 SAMEA103885182 eiwy_A eiwy_1 naive C:/Users/cason/AppData/Local/R/win-…
## 6 SAMEA103885136 eiwy_B eiwy_1 IFNg C:/Users/cason/AppData/Local/R/win-…
## 7 SAMEA103885413 eiwy_C eiwy_1 SL1344 C:/Users/cason/AppData/Local/R/win-…
## 8 SAMEA103884967 eiwy_D eiwy_1 IFNg_SL1344 C:/Users/cason/AppData/Local/R/win-…
## 9 SAMEA103885368 fikt_A fikt_3 naive C:/Users/cason/AppData/Local/R/win-…
## 10 SAMEA103885218 fikt_B fikt_3 IFNg C:/Users/cason/AppData/Local/R/win-…
## # ℹ 14 more rows
library(SummarizedExperiment)
## Caricamento del pacchetto richiesto: MatrixGenerics
## Caricamento del pacchetto richiesto: matrixStats
##
## Caricamento pacchetto: 'matrixStats'
## Il seguente oggetto è mascherato da 'package:dplyr':
##
## count
##
## Caricamento pacchetto: 'MatrixGenerics'
## I seguenti oggetti sono mascherati da 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Caricamento del pacchetto richiesto: GenomicRanges
## Caricamento del pacchetto richiesto: stats4
## Caricamento del pacchetto richiesto: BiocGenerics
## Caricamento del pacchetto richiesto: generics
##
## Caricamento pacchetto: 'generics'
## Il seguente oggetto è mascherato da 'package:dplyr':
##
## explain
## I seguenti oggetti sono mascherati da 'package:base':
##
## as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
## setequal, union
##
## Caricamento pacchetto: 'BiocGenerics'
## Il seguente oggetto è mascherato da 'package:dplyr':
##
## combine
## I seguenti oggetti sono mascherati da 'package:stats':
##
## IQR, mad, sd, var, xtabs
## I seguenti oggetti sono mascherati da 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
## unsplit, which.max, which.min
## Caricamento del pacchetto richiesto: S4Vectors
##
## Caricamento pacchetto: 'S4Vectors'
## I seguenti oggetti sono mascherati da 'package:dplyr':
##
## first, rename
## Il seguente oggetto è mascherato da 'package:utils':
##
## findMatches
## I seguenti oggetti sono mascherati da 'package:base':
##
## expand.grid, I, unname
## Caricamento del pacchetto richiesto: IRanges
##
## Caricamento pacchetto: 'IRanges'
## I seguenti oggetti sono mascherati da 'package:dplyr':
##
## collapse, desc, slice
## Il seguente oggetto è mascherato da 'package:grDevices':
##
## windows
## Caricamento del pacchetto richiesto: GenomeInfoDb
## Caricamento del pacchetto richiesto: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Caricamento pacchetto: 'Biobase'
## Il seguente oggetto è mascherato da 'package:MatrixGenerics':
##
## rowMedians
## I seguenti oggetti sono mascherati da 'package:matrixStats':
##
## anyMissing, rowMedians
library(tximeta)
se <- tximeta(coldata, dropInfReps=TRUE, useHub=FALSE, skipSeqinfo=TRUE)
## importing quantifications
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## found matching linked transcriptome:
## [ GENCODE - Homo sapiens - release 29 ]
## loading existing TxDb created: 2025-07-08 11:34:32
## Caricamento del pacchetto richiesto: GenomicFeatures
## Caricamento del pacchetto richiesto: AnnotationDbi
##
## Caricamento pacchetto: 'AnnotationDbi'
##
## Il seguente oggetto è mascherato da 'package:dplyr':
##
## select
##
## loading existing transcript ranges created: 2025-07-08 11:34:33
gse <- summarizeToGene(se, assignRanges="abundant")
## loading existing TxDb created: 2025-07-08 11:34:32
## obtaining transcript-to-gene mapping from database
## loading existing gene ranges created: 2025-07-08 11:34:45
## assignRanges='abundant': gene ranges assigned by isoform abundance
## see details at: ?summarizeToGene,SummarizedExperiment-method
## summarizing abundance
## summarizing counts
## summarizing length
library(DESeq2)
dds <- DESeqDataSet(gse, ~line + condition)
## using counts and average transcript lengths from tximeta
dds <- dds[,dds$condition %in% c("naive","IFNg")]
dds$condition <- droplevels(dds$condition)
dds$condition <- relevel(dds$condition, "naive")
keep <- rowSums(counts(dds) >= 10) >= 6
dds <- dds[keep,]
The model is fit with the following line of code:
dds <- DESeq(dds)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds, lfcThreshold=1)
summary(res)
##
## out of 16086 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 1.00 (up) : 793, 4.9%
## LFC < -1.00 (down) : 523, 3.3%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 7)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
To see the results of the expression analysis, we can generate a summary table and an MA plot:
DESeq2::plotMA(res, ylim=c(-10,10))
library(plyranges)
##
## Caricamento pacchetto: 'plyranges'
## Il seguente oggetto è mascherato da 'package:AnnotationDbi':
##
## select
## Il seguente oggetto è mascherato da 'package:IRanges':
##
## slice
## I seguenti oggetti sono mascherati da 'package:dplyr':
##
## between, n, n_distinct
## Il seguente oggetto è mascherato da 'package:stats':
##
## filter
all_genes <- results(dds, lfcThreshold=1, format="GRanges") |>
names_to_column("gene_id") |>
select(gene_id, de_log2FC = log2FoldChange, de_padj = padj, de_pval = pvalue)
genome(all_genes) <- "hg38"
si <- Seqinfo(genome="hg38")
#save(Seqinfo, file="seqinfo.rda")
#load("seqinfo.rda")
si <- keepStandardChromosomes(si)
seqinfo(all_genes) <- si
The following section describes the process we have used for generating a GRanges object of differential peaks from the ATAC-seq data in @alasoo. The code chunks for the remainder of this section are not run. For assessing differential accessibility, we followed the original paper, using limma [@Smyth2004], and generating a summary of LFCs and adjusted p-values for the peaks.
library(fluentGenomics)
# atac <- readRDS(cache_atac_se())
library(limma)
design <- model.matrix(~donor + condition, colData(atac))
fit <- lmFit(assay(atac), design)
fit <- eBayes(fit)
idx <- which(colnames(fit$coefficients) == "conditionIFNg")
tt <- topTable(fit, coef=idx, sort.by="none", n=nrow(atac))
atac_peaks <- rowRanges(atac) |>
remove_names() |>
mutate(
da_log2FC = tt$logFC,
da_padj = tt$adj.P.Val
) |>
set_genome_info(genome = "hg38")
seqlevelsStyle(atac_peaks) <- "UCSC"
The final GRanges object containing the DA peaks is included in the workflow package and can be loaded as follows:
library(fluentGenomics)
peaks
## GRanges object with 296220 ranges and 3 metadata columns:
## seqnames ranges strand | peak_id da_log2FC
## <Rle> <IRanges> <Rle> | <character> <numeric>
## [1] chr1 9979-10668 * | ATAC_peak_1 0.266185
## [2] chr1 10939-11473 * | ATAC_peak_2 0.322177
## [3] chr1 15505-15729 * | ATAC_peak_3 -0.574160
## [4] chr1 21148-21481 * | ATAC_peak_4 -1.147066
## [5] chr1 21864-22067 * | ATAC_peak_5 -0.896143
## ... ... ... ... . ... ...
## [296216] chrX 155896572-155896835 * | ATAC_peak_296216 -0.834629
## [296217] chrX 155958507-155958646 * | ATAC_peak_296217 -0.147537
## [296218] chrX 156016760-156016975 * | ATAC_peak_296218 -0.609732
## [296219] chrX 156028551-156029422 * | ATAC_peak_296219 -0.347678
## [296220] chrX 156030135-156030785 * | ATAC_peak_296220 0.492442
## da_padj
## <numeric>
## [1] 9.10673e-05
## [2] 2.03435e-05
## [3] 3.41708e-08
## [4] 8.22299e-26
## [5] 4.79453e-11
## ... ...
## [296216] 1.33546e-11
## [296217] 3.13015e-01
## [296218] 3.62339e-09
## [296219] 6.94823e-06
## [296220] 7.07664e-13
## -------
## seqinfo: 23 sequences from hg38 genome; no seqlengths
seqlevels(peaks) <- seqlevels(si)
seqinfo(peaks) <- si
da_peaks <- peaks |>
filter(da_padj < .01, abs(da_log2FC) > .5)
tss_by_de <- all_genes |>
mutate(de_sig =
case_when(
de_padj <= .01 ~ "de",
TRUE ~ "non-de"
)) |>
filter(!dplyr::between(de_padj, .01, .99)) |>
anchor_5p() |>
mutate(width=1)
dist_res <- tss_by_de |>
add_nearest_distance(da_peaks)
dist_res_clean <- dist_res |>
as_tibble() |>
tidyr::drop_na()
dist_res_clean |>
group_by(de_sig) |>
summarize(mean = mean(distance), sd = sd(distance))
## # A tibble: 2 × 3
## de_sig mean sd
## <chr> <dbl> <dbl>
## 1 de 3173. 5927.
## 2 non-de 7705. 13266.
library(ggplot2)
dist_res_clean |>
filter(distance < 1e6) |>
mutate(log10_dist = log10(distance + 1)) |>
ggplot(aes(log10_dist, color=de_sig)) +
geom_density()
dist_res %>%
mutate(near_peaks = count_overlaps(., da_peaks, maxgap=100)) |>
as_tibble() |>
dplyr::count(de_sig, near_peaks)
## # A tibble: 6 × 3
## de_sig near_peaks n
## <chr> <int> <int>
## 1 de 0 664
## 2 de 1 338
## 3 de 2 4
## 4 non-de 0 12840
## 5 non-de 1 1280
## 6 non-de 2 28
library(nullranges)
da_peaks_chr <- da_peaks |>
dropSeqlevels(c("chrY","chrM")) |>
sort()
set.seed(1)
boot <- bootRanges(da_peaks_chr, blockLength=1e6, R=5)
all_peaks <- bind_ranges(
da_peaks %>% mutate(iter=0),
boot
)
tss_by_de |>
join_overlap_inner(all_peaks, maxgap=100) |>
as_tibble() |>
select(gene_id, peak_id, de_sig, iter) |>
group_by(de_sig, iter) |>
summarize(overlaps_any = n_distinct(gene_id))
## `summarise()` has grouped output by 'de_sig'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 3
## # Groups: de_sig [2]
## de_sig iter overlaps_any
## <chr> <fct> <int>
## 1 de 0 342
## 2 de 1 33
## 3 de 2 24
## 4 de 3 20
## 5 de 4 19
## 6 de 5 33
## 7 non-de 0 1308
## 8 non-de 1 407
## 9 non-de 2 390
## 10 non-de 3 369
## 11 non-de 4 373
## 12 non-de 5 378