## Simulation of RNA-Seq data genes <- paste("gene_",10001:24000,sep="") samples <- c(paste("samp_0",1:9,sep=""),paste("samp_",10:48,sep="")) strain <- factor(rep(paste("s_",1:8,sep=""),6)) pheno <- factor(rep(rep(c("slow","fast"),each=4),6)) time <- factor(rep(c(0,15,90),each=16)) colData <- data.frame(samples,strain,pheno,time) str(colData) colData set.seed(123) ## Base value of qij is beta_i0 and depends on gene beta0 <- rnorm(14000,9,2) log2q <- matrix(rep(beta0,48),ncol=48) log2q[1:5,1:5] ## some genes are up or down regulated in fast or in slow strains genes.fast <- sample(1:14000,50) genes.slow <- sample(1:14000,50) ## some genes are up or down regulated in fast strains after 15 min. genes.fast.15 <- sample(1:14000,50) ## sample which genes are up or down regulated for certain strains genes.strain <- list() for(s in 1:8) { genes.strain[[s]] <- sample(1:14000,500) } ## now change log2q according to the sampled genes. The effect can depend on ## the gene. Here we sample these effects from normal distributions. gene.fast.effects <- rnorm(50,3,1) gene.slow.effects <- rnorm(50,-1,2) log2q[genes.fast,pheno=="fast"] <- log2q[genes.fast,pheno=="fast"]+matrix(rep(gene.fast.effects,24),ncol=24) log2q[genes.slow,pheno=="slow"] <- log2q[genes.slow,pheno=="slow"]+matrix(rep(gene.slow.effects,24),ncol=24) gene.fast.15.effects <- rnorm(50,2,2) gene.slow.15.effects <- rnorm(50,-1,3) log2q[genes.fast,pheno=="fast" & time=="15"] <- log2q[genes.fast,pheno=="fast" & time=="15"] + matrix(rep(gene.fast.15.effects,8),ncol=8) log2q[genes.slow,pheno=="slow" & time=="15"] <- log2q[genes.slow,pheno=="slow" & time=="15"] + matrix(rep(gene.slow.15.effects,8),ncol=8) genes.strain.effects <- list() for(s in 1:8) { genes.strain.effects[[s]] <- rnorm(500,0,2) log2q[genes.strain[[s]],strain==paste("s_",s,sep="")] <- log2q[genes.strain[[s]],strain==paste("s_",s,sep="")] + matrix(rep(genes.strain.effects[[s]],6),ncol=6) } ## sj depends on sample (coverage). Let's draw them from an exponential distrib s <- rexp(48,30) matrix(rep(s,14000),ncol=48,byrow=TRUE)[1:5,1:5] mu <- (2^log2q)*matrix(rep(s,14000),ncol=48,byrow=TRUE) ## use log-normal prior for gene-specific dispersion parameters: alpha <- exp(rnorm(14000,1,0.1)) ## finaly sample gene counts from negative binomial distribution: K <- matrix(rnbinom(14000*48,mu=mu,size=rep(1/alpha,48)),ncol=48) dimnames(K) <- list(genes,samples) K[1:10,1:10] library(DESeq2) dds <- DESeqDataSetFromMatrix(countData= K, colData= colData, design= ~ pheno + time + pheno:time) dds1 <- DESeq(dds) ## the following is for the case time = "0" res <- results(dds1,contrast=c("pheno","fast","slow")) summary(res) res res[order(res$padj),] as.data.frame(res[order(res$padj),][1:80,]) res[!is.na(res$padj) & res$padj<0.05,] as.data.frame(res[!is.na(res$padj) & res$padj<0.05,]) ## equivalent: resultsNames(dds1) res <- results(dds1,name="pheno_slow_vs_fast") summary(res) res res[order(res$padj),] as.data.frame(res[order(res$padj),][1:80,]) res[!is.na(res$padj) & res$padj<0.05,] as.data.frame(res[!is.na(res$padj) & res$padj<0.05,]) ## now for time = "15"; Is interaction term for slow:"15" significant? ## Baseline is case fast:"0". ## (is this right??? Please let me know if not right.) res <- results(dds1,name="phenoslow.time15") summary(res) res res[order(res$padj),] as.data.frame(res[order(res$padj),][1:80,]) res[!is.na(res$padj) & res$padj<0.05,] as.data.frame(res[!is.na(res$padj) & res$padj<0.05,]) ## Find genes for which interaction pheno:time is significant: lrt <- nbinomLRT(dds1, reduced = ~ pheno + time) res.lrt <- results(lrt) as.data.frame(res.lrt[!is.na(res.lrt$padj) & res.lrt$padj<0.05,])