# This script prepares eQTL results for mashr.
# 1/28/2020 JR
# Based on the mashr vignette: https://stephenslab.github.io/mashr/articles/intro_mash.html
# Last edited and run 10/7/2020 JR: cleaned up.

library(ashr)
library(mashr)
library(data.table)
library(dplyr)
library(doParallel)
library(RhpcBLASctl)

# use the number of tasks allocated by SLURM for the parallel workers and BLAS threads
cores <- as.integer(Sys.getenv("SLURM_STEP_TASKS_PER_NODE"))
registerDoParallel(cores = cores)
blas_set_num_threads(cores)

folder = "SCAIP1-6"
load(paste0("./mashr_input_", folder, ".Rd"))

# subset to T cells only:
SEs <- SEs[, grep("Tcell", colnames(SEs))]
slopes <- slopes[, grep("Tcell", colnames(slopes))]
pvalues <- pvalues[, grep("Tcell", colnames(pvalues))]

# let's remove all NAs as well as Inf from SEs, keeping slopes and pvalues in sync:
keep <- rowSums(!is.finite(as.matrix(SEs))) == 0
SEs <- SEs[keep, ]
slopes <- slopes[keep, ]
pvalues <- pvalues[keep, ]
# sanity checks: number of remaining rows with any NA (should be 0)
sum(rowSums(is.na(as.matrix(SEs))) > 0)
sum(rowSums(is.na(as.matrix(pvalues))) > 0)

stopifnot(identical(rownames(slopes), rownames(SEs)))
stopifnot(identical(rownames(slopes), rownames(pvalues)))
# dim(SEs) # 4019906 5

# 1. Set up the mash data object from effect sizes and their standard errors
data = mash_set_data(as.matrix(slopes), as.matrix(SEs))

# 2. Set up the DATA-DRIVEN covariance matrices
# 2.1. Select strong signals
m.1by1 = mash_1by1(data)
strong = get_significant_results(m.1by1, 0.05)
# 2.2. Obtain initial data-driven covariance matrices
# this step estimates the covariances of the observed data
U.pca = cov_pca(data, 5, subset = strong)
print(names(U.pca))
# canonical covariance matrices:
U.c = cov_canonical(data)
# 2.3. Apply Extreme Deconvolution
# this step estimates the covariances of the actual underlying effects
U.ed = cov_ed(data, U.pca, subset = strong)

# save the mashr input data:
save(data, U.ed, U.c, file = "Tcell_mash-ready-data.Rd")

sessionInfo()
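
## Downstream usage (illustrative sketch, not part of this prep script): based on
## the mashr vignette cited above, the saved objects could later be used to fit the
## mash model with the combined canonical and data-driven covariances. The output
## filename "Tcell_mash-fit.rds" is hypothetical. Wrapped in `if (FALSE)` so it is
## never executed when this script is run.
if (FALSE) {
  load("Tcell_mash-ready-data.Rd")   # restores `data`, `U.c`, and `U.ed`
  m <- mash(data, c(U.c, U.ed))      # fit mash using both covariance sets
  head(get_lfsr(m))                  # local false sign rate per effect and condition
  saveRDS(m, "Tcell_mash-fit.rds")   # hypothetical output file
}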