##
## The following makefile controls the pipeline described in 
## Moyerbrailean et al (http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1005875)
## Wayne State University, 2016
##

## Required software
## * samtools
## * bedtools
## * UCSC utils (http://hgdownload.soe.ucsc.edu/admin/exe/)

## Custom utils
## * scanPwm
## * scanPwmVar
## * makeXbFile
## * bedXbFileXtract

# Folders
dnaseFolder  = dnase/
seedFolder   = pwms/seed/
t5kBedFolder = $(seedFolder)pwmScan.sites/
t5kSeqFolder = $(seedFolder)pwmScan.seq/
pwmFolder    = pwms/recal/pwmFiles.new/
selFolder    = pwms/selPwms/
initCenti    = initial/
tmp          = /dev/shm/ # Any temporary storage directory

# Files
1kGenomes     = genome/variants.txt.gz # Tab separated, chr / pos / ref allele / alt allele
genome        = genome/hg19.fa    # Genome in fasta format
genome2bit    = genome/hg19.2bit  # Genome in 2bit format
genometmp2bit = $(tmp)/hg19.2bit
chromSizes    = genome/chromSizes.hg19.txt # From UCSC

# Scripts
lambdaScript    = src/parseCentipedeTop5K.Lambda.v2.gmb.R
pwmScript       = src/parseCentipedeTop5K.Pwm.v2.gmb.R
betaScript      = src/parseCentipedeTop5K.AllBeta.v5.gmb.R
initCentiScript = src/runCentipede.R
fullCentiScript = src/runCentipedeOnAll.R

# Misc
awkDnaseStr = '{if(and($$2,0x10)==0){print $$3,$$4-1,"+"};if(and($$2,0x10)==16){print $$3,$$4-1+length($$10)-1,"-"}}'

# Parameters
ncpus       = 10
zThresh     = 5
numJobs     = 60
numJobsFull = 42

# Helper commands
.SECONDARY:

%.Folder:
	mkdir -p $*;

$(genometmp2bit): $(genome2bit)
	cp $(genome2bit) $(genometmp2bit)

%.Qsub64:
	touch $@
	echo "cd $${PWD}; make $*" | qsub -q masxq -l nodes=1:ppn=64 -o $@ -e $@.e

%.Qsub12:
	touch $@
	echo "cd $${PWD}; make $*" | qsub -q mmtxq -l nodes=1:ppn=12 -o $@ -e $@.e


######################
## Initial PWM scan ##
######################

## Using the seed motifs, generate the training sites using 'scanPwm'
## 1) scan pwm human, choose top-scoring matches
## 2) scan pwm chimp, liftover top-scoring matches (not in [1])
## 3) scan pwm macaque, liftover top-scoring matches (not in [1,2])
## 4) combine sequences from [1,2,3], use for next step

# Extract the sequences at the Top5K sites
$(seedFolder)pwmScan.seq/%.seq.gz: $(t5kBedFolder)/%.bed.gz $(genometmp2bit)
	less $< | seqBedFor2bit -noMask -window=20 $(genometmp2bit) stdin stdout | gzip > $@
all: ${patsubst $(pwmFolder)/%.bed.gz,%.seq.gz,${wildcard $(pwmFolder)/*.bed.gz}}


############################
## Initial CENTIPEDE scan ##
############################

# Convert DNase-seq data to x8b files
$(dnaseFolder)%.x8b.bz2: $(dnaseFolder)%.bam
	samtools view -q10 $^ | awk -F"\t" $(awkDnaseStr) | makeXbFile $(chromSizes) -readSize=0 stdin $(tmp)/$*.x8b -verbose=2
	pbzip2 -b1m2000vp$${NCPUS:-1} $(tmp)/$*.x8b
	mv $(tmp)/$*.x8b.bz2 $@
allDnase: $(patsubst $(dnaseFolder)%.bam, $(dnaseFolder)%.x8b.bz2.Qsub12, $(wildcard $(dnaseFolder)*.bam))

# Get the coverage for each sample
$(dnaseFolder)numReads.txt:
	grep '... completed with ' *.Qsub12.e | sed 's/:.*with /\t/g' | sed 's/ reads//g' > $@

# Run CENTIPEDE for a Tissue-TF pair (see below)
$(initCenti)$(expName)/%.out.gz:
	-R --vanilla --args $* $(expName) ./$(initCenti) < $(initCentiScript) | gzip > $@ 2> $@.err
$(expName).mk: $(tmp)/$(expName).x8b $(patsubst $(t5kBedFolder)/%.bed.gz, $(initCenti)$(expName)/%.out.gz, $(wildcard $(t5kBedFolder)/*.bed.gz))

# Submit a tissue to run on its own node (preferred method)
$(initCenti)%.exp: $(dnaseFolder)/%.x8b.bz2
	mkdir -p $(initCenti)$*
	cp $^ $(tmp)
	pbzip2 -dm1000vp$(numJobs) $(tmp)/$*.x8b.bz2
	make -j $(numJobs) $*.mk expName=$*
	rm -f $(tmp)$*.x8b
initQsub: $(patsubst $(dnaseFolder)/%.x8b.bz2, $(initCenti)%.exp.Qsub64, $(wildcard $(dnaseFolder)/*.x8b.bz2))

#############################
## PWM model Recalibration ##
#############################

# Parse out the CENTIPEDE data necessary for PWM recalibration
$(initCenti)combo/%.lambdaParse.gz:
	-R --vanilla --args $* < $(lambdaScript) | gzip > $@ 2> $@.err
comboParse: $(initCenti)combo.Folder $(patsubst $(t5kBedFolder)/%.bed.gz, $(initCenti)combo/%.lambdaParse.gz, $(wildcard $(t5kBedFolder)/*.bed.gz))

# Recalibration of PWM motif models
$(pwmFolder)%.txt.gz: $(pwmFolder).Folder
	-R --vanilla --args $* $(seedFolder) $(pwmFolder) < $(pwmScript) | gzip > $@ 2> $@.err &

# Identification of active tissue-TF pairs
selNames.txt: selPwms.Folder
	-R --vanilla --args $(zThresh) < $(betaScript) > selPwms/output.txt 2> selPwms/output.txt.e;
	cat selPwms/*.pwms.txt | sort -k 1 | uniq > $@


##########################
## Genome-wide PWM scan ##
##########################

# Scan the genome for all matches above a threshold
pwms/recal/pwmRescan.sites/%.bed.gz: $(pwmFolder)/%.pwm
	T=`less $^ | grep -w '#p.Tcut2' | cut -f5`; \
	scanPwm $^ $(genome) -t=$$T -base=2 -omp=$(ncpus) | awk -v OFS='\t' '{print $$1,$$2,$$3,"$*",$$5,$$4}' | bedSort stdin stdout | gzip > $@

# Scan the genome again, this time considering 1KG variants
pwms/recal/pwmSNPs.sites/%.bed.gz: $(pwmFolder)/%.pwm
	T=`less $^ | grep -w '#p.Tcut2' | cut -f5`; \
	scanPwmVar $^ $(genome) $(1kGenomes) -base=2 -t=$$T -omp=$(ncpus) | bedSort stdin stdout | gzip > $@ 2>$@.e

# Run the scans for motifs with active footprints in >=1 tissues
all: selNames.txt $(genome)
	cat $< | while read f; do \
		${MAKE} pwms/recal/pwmRescan.sites/$${f}.bed.gz; \
		${MAKE} pwms/recal/pwmSNPs.sites/$${f}.bed.gz; \
	done
	rm $(genome)


#########################
## Full CENTIPEDE scan ##
#########################

# Basic job for a given tissue-TF pair
full/res/$(expName)/%.out.gz: full/res/$(expName).Folder
	-nice -n10 ionice -c2 -n5 ~/piquelab/bin/R --vanilla --args $* $(expName) ./ < $(fullCentiScript) 2> $@.err | gzip > $@

# Submit a tissue to run on its own node
full/res/%.exp: $(dnaseFolder)/%.x8b.bz2
	hostname
	mkdir -p full/res/$*
	cp $^ $(tmp)/
	pbzip2 -dm1000fvp$(numJobsFull) $(tmp)/$*.x8b.bz2
	ionice -c2 -n5 make -j $(numJobsFull) full/res/$*.mk expName=$*
	rm -f $(tmp)/$*.x8b
fullQsub: $(patsubst $(dnaseFolder)/%.x8b.bz2, full/res/%.exp.Qsub64, $(wildcard $(dnaseFolder)/*.x8b.bz2))