This is a template script written by Scott Hunicke-Smith to illustrate how to run exome analysis much faster on Lonestar. It requires only two fastq files (a read pair) and two parameters. It is NOT optimized and not highly robust; it relies on many sub-scripts, both within Scott's home directory and in the BioITeam corral directories.
This bash script needs to be run on a head node somewhere where it won't be killed:
#!/bin/bash
# Copyright 2012 Scott Hunicke-Smith and the University of Texas at Austin
#
# Fast exome-analysis driver for TACC Lonestar.
# Usage: fastExon.sh <read1.fastq> <read2.fastq> <splitSize> <batchSize>
#   splitSize - lines per split chunk (should be a multiple of 4 so reads
#               are not cut in half -- TODO confirm caller enforces this)
#   batchSize - number of split-file pairs mapped per subdirectory/node
#
# Run this on a head node (or somewhere long-lived): it polls qstat while
# the submitted cluster jobs run.
module load python
module load bwa
module load samtools
module load java64

# Fail early with a usage message if any positional parameter is missing.
if [ $# -ne 4 ]; then
    echo "Usage: $0 <read1.fastq> <read2.fastq> <splitSize> <batchSize>" >&2
    exit 1
fi
r1file=$1
r2file=$2
splitSize=$3
batchSize=$4
queue="normal"
echo "Starting: $(date)"
# 1. Split input fastq's as one job; store job #
# Both reads are split with identical line counts so r1.NN pairs with r2.NN.
echo "split -d -l $splitSize $r1file r1.
split -d -l $splitSize $r2file r2. " > split.script
/home1/01057/sphsmith/local/bin/launcher_creator.py -q $queue -j split.script -l split.sge -a DNAdenovo -n split -t 1:00:00
qsub split.sge &> split.sge.sublog
# Job id is the third field of the last line of qsub's output.
splitJID=$(tail -1 split.sge.sublog | awk '{print $3}')
echo "Submitted $splitJID to split input files at $(date)"
echo "Waiting for split to finish"
# Poll until the job id no longer appears in the queue.  -w prevents a
# substring match against some other job whose id contains $splitJID.
while qstat | grep -w "$splitJID" ; do
    date
    sleep 30
done
# 2. Move a set of splits into their own directory
#    Every $batchSize split-file pairs (r1.NN + r2.NN) are collected into
#    their own subdirectory b.<i> so each batch can be mapped on one node.
i=0
fileList=""
subdirList=""
for file in r1.* ; do
    [ -e "$file" ] || continue          # no split files at all: nothing to do
    fileExt="${file##*.}"               # the split suffix, e.g. "00"
    fileList="$fileList $fileExt"
    # Once a full batch has accumulated, move it into its own directory.
    if [ $(( i % batchSize )) -eq $(( batchSize - 1 )) ]; then
        mkdir b.$i
        subdirList="$subdirList b.$i"
        for dataFiles in $fileList ; do
            mv r2.$dataFiles b.$i
            mv r1.$dataFiles b.$i
        done
        fileList=""
    fi
    i=$(( i + 1 ))
done
# And the residual set, if any: when the last batch is smaller than
# $batchSize, leftover r1.*/r2.* files remain in the run directory; move
# them all into one final subdirectory.  (A single existence test replaces
# the old per-file loop, which re-ran mkdir and appended b.$i to
# subdirList once per leftover file, creating duplicate subdir entries.)
if ls r1.* >/dev/null 2>&1 ; then
    mkdir b.$i
    subdirList="$subdirList b.$i"
    mv r1.* b.$i
    mv r2.* b.$i
fi
# 3. Launch exome_step1.sh on each split within its own directory; store
#    job numbers; exome_step2.sh later combines the per-chromosome files.
mapJIDs=""
for subdir in $subdirList ; do
    cd $subdir
    echo "Creating launcher for all files in $subdir: $(date)"
    # Remove leftovers from any previous run.  map.script is appended to
    # below, so a stale copy would duplicate every mapping command (the
    # original removed only map.sge).
    rm -f map.sge map.script
    for file in r1.* ; do
        [ -e "$file" ] || continue
        fileExt="${file##*.}"
        echo "Run exome_step1.sh on r1.$fileExt and r2.$fileExt"
        echo "/home1/01057/sphsmith/local/bin/exome_step1.bash r1.$fileExt r2.$fileExt /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta mapped.$fileExt >& mapped.$fileExt.log" >> map.script
    done
    # -w 2 => two mapping tasks per node (each can then use multiple bwa threads)
    /home1/01057/sphsmith/local/bin/launcher_creator.py -q $queue -j map.script -l map.sge -a DNAdenovo -n map.$subdir -t 1:00:00 -w 2
    qsub map.sge &> map.sge.sublog
    mapJID=$(tail -1 map.sge.sublog | awk '{print $3}')
    mapJIDs="$mapJIDs,$mapJID"
    echo "Submitted $mapJID to map input files in $subdir at $(date)"
    cd ..
done
mapJIDs=${mapJIDs#,}    # drop leading comma so -hold_jid gets a clean list
echo "Waiting for mapping to finish"
# Wait until NONE of the submitted mapping jobs remain in the queue.
# (The original polled only the last job id, so earlier batches could
# still be running when this loop exited.)
while qstat | grep -E -w "$(echo "$mapJIDs" | tr ',' '|')" ; do
    date
    sleep 30
done
echo "Finished: $(date)"
# 4. Launch job to combine final chr files across all directories
echo "Creating launcher for merging by reference sequence: $(date)"
# Derive the reference-sequence (chromosome) names from the per-chromosome
# sorted BAM files in the first batch directory; $3 of a "b.N/chrX.mapped..."
# path split on '.' and '/' is the chromosome name.  (No eval needed --
# the original eval'd a fixed command string for no benefit.)
subdir=$(ls -d b.* | head -1)
subdirExt="${subdir##*.}"
refList=$(ls b.$subdirExt/*.mapped.*.sorted.bam | awk -F '[./]' '{print $3}' | sort | uniq)
# Randomize this list so large and small reference sequences are mixed up
# across launcher tasks.  (Dropped the stray "\n" the original printed,
# which generated blank records.)
refList=$(echo $refList | awk 'BEGIN {srand()} {for (i=1;i<=NF;i++) print rand() "\t" $i}' | sort -n | cut -f 2)
echo $refList
rm -f merge.script
for refs in $refList ; do
    echo "Merging $refs"
    echo "samtools merge -f $refs.sorted.bam b.*/$refs.mapped.*.sorted.bam; samtools index $refs.sorted.bam" >> merge.script
done
/home1/01057/sphsmith/local/bin/launcher_creator.py -q $queue -j merge.script -l merge.sge -a DNAdenovo -n merge -t 1:00:00 -w 4
echo "Submitting job; queue start contingent on $mapJIDs completing first"
qsub merge.sge -hold_jid $mapJIDs &> merge.sge.sublog
mergeJID=$(tail -1 merge.sge.sublog | awk '{print $3}')
echo "Submitted $mergeJID to merge output files at $(date)"
echo "Waiting for merging to finish"
while qstat | grep -w "$mergeJID" ; do
    date
    sleep 30
done
# 5. Launch GATK on each reference sequence's sorted bam file
echo "Creating launcher for variant calling by reference sequence: $(date)"
# Recompute the chromosome list exactly as in step 4.
subdir=$(ls -d b.* | head -1)
subdirExt="${subdir##*.}"
refList=$(ls b.$subdirExt/*.mapped.*.sorted.bam | awk -F '[./]' '{print $3}' | sort | uniq)
# Randomize this list so large and small reference sequences are mixed up
# (chromosomes are roughly size-ordered by name, so a sorted list would
# put all the big ones on one node).
refList=$(echo $refList | awk 'BEGIN {srand()} {for (i=1;i<=NF;i++) print rand() "\t" $i}' | sort -n | cut -f 2)
echo $refList
rm -f variants.script
for refs in $refList ; do
    echo "GATK via exome_step2.bash on $refs.sorted.bam"
    echo "/home1/01057/sphsmith/local/bin/exome_step2.bash $refs.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf $refs >& variants.$refs.log" >> variants.script
done
# Note that the -w 2 option here defines how many GATK's run per node - might need optimization
/home1/01057/sphsmith/local/bin/launcher_creator.py -q $queue -j variants.script -l variants.sge -a DNAdenovo -n variants -t 4:00:00 -w 2
# Inject the extra module loads the variant jobs need.  NOTE(review): the
# '\n' in the replacement is a GNU sed extension -- fine on Lonestar.
sed -i s/'module load launcher'/'module load launcher\nmodule load java64\nmodule load samtools'/ variants.sge
qsub variants.sge -hold_jid $mergeJID &> variants.sge.sublog
variantsJID=$(tail -1 variants.sge.sublog | awk '{print $3}')
echo "Submitted $variantsJID to call variants at $(date)"
echo "Waiting for variant calling to finish"
while qstat | grep -w "$variantsJID" ; do
    date
    sleep 30
done
# 6. Merge all bam files & vcf files
# Remove any output from a previous run first: $r1file.sorted.bam itself
# matches the *.sorted.bam glob below, so a stale copy would be merged
# into itself.
rm -f $r1file.sorted.bam
echo "samtools merge -f $r1file.sorted.bam *.sorted.bam" > merge2.script
/home1/01057/sphsmith/local/bin/launcher_creator.py -q $queue -j merge2.script -l merge2.sge -a DNAdenovo -n merge2 -t 1:00:00 -w 4
qsub merge2.sge -hold_jid $variantsJID &> merge2.sge.sublog
merge2JID=$(tail -1 merge2.sge.sublog | awk '{print $3}')
echo "Submitted $merge2JID to merge output files at $(date)"
echo "Waiting for merging to finish"
while qstat | grep -w "$merge2JID" ; do
    date
    sleep 30
done
# Concatenate the per-chromosome VCFs: header lines ('#') from one file,
# then all data lines.  -h suppresses the "filename:" prefix grep adds
# when given multiple files, which would otherwise corrupt the VCF.
# NOTE(review): assumes reference sequences are named chr* and that
# chrX.sorted.bam.snps.vcf exists -- confirm against exome_step2.bash.
grep '^#' chrX.sorted.bam.snps.vcf > $r1file.snps.vcf
grep -h -v '^#' chr*.sorted.bam.snps.vcf >> $r1file.snps.vcf
echo "Fast exon analysis is complete at: $(date)"
It uses the TACC "launcher" functionality to do the following:
- Create one job on one node which splits the two input fastq files into files with
$splitSize
lines per file each using split.sge
and split.script
. Wait for job to finish.
- Create as many subdirectories as needed for the split output files to be mapped
$batchSize
per directory. Lonestar nodes have two sockets with 12 processors per socket, so a good choice here is to make $batchSize
two so that the mapping step can use 6 threads.
- Create a map.script in each subdirectory (one mapping command per split-file pair, i.e. $batchSize lines per directory) and submit map.sge
to do the mapping. Note that the embedded mapping script exome_step1.sh
to do the mapping. Note that the embedded mapping script exome_step1.sh
splits the mapping output into chromosome-specific files during the bwa sampe
step. This mapping script is also where multi-threading for bwa is set. It should be parameterized of course.
- Merge all the chromosome-specific files from these subdirectories back to the run directory using
merge.sge
and merge.script
- Run GATK on sets of these chromosome-specific files, with 2 GATK's per node (hardcoded in script right now) using
variants.sge
and variants.script
; since chromosomes are usually named based on their size (i.e. chr1 < chr2 < chr3, etc.), randomize the list so that we don't wind up with all the big chromosomes on one node.
- Merge the final GATK chromosome-specific variant calls - both the BAM files and the VCF files - using
merge2.sge
and merge2.script
.
Examples of the various .sge
and .script
files are shown below.
Benchmark analysis on ~40 million read pairs from a single human exome experiment shows that this script takes about 2 hours vs. about 15 hours if all these same processes are run on only 1 node.
Expand here to see example split.sge and split.script
#!/bin/csh
#
# Simple SGE script for submitting multiple serial
# jobs (e.g. parametric studies) using a script wrapper
# to launch the jobs.
#
# To use, build the launcher executable and your
# serial application(s) and place them in your WORKDIR
# directory. Then, edit the CONTROL_FILE to specify
# each executable per process.
#-------------------------------------------------------
#-------------------------------------------------------
#
# <------ Setup Parameters ------>
#
#$ -N split
#$ -pe 12way 12
#$ -q normal
#$ -o split.o$JOB_ID
#$ -l h_rt=1:00:00
#$ -V
#$ -cwd
# <------ You MUST Specify a Project String ----->
#$ -A DNAdenovo
#------------------------------------------------------
#
# Usage:
# #$ -pe <parallel environment> <number of slots>
# #$ -l h_rt=hours:minutes:seconds to specify run time limit
# #$ -N <job name>
# #$ -q <queue name>
# #$ -o <job output file>
# NOTE: The env variable $JOB_ID contains the job id.
#
module load launcher
setenv EXECUTABLE $TACC_LAUNCHER_DIR/init_launcher
setenv CONTROL_FILE split.script
setenv WORKDIR .
#
# Variable description:
#
# EXECUTABLE = full path to the job launcher executable
# CONTROL_FILE = text input file which specifies
# executable for each process
# (should be located in WORKDIR)
# WORKDIR = location of working directory
#
# <------ End Setup Parameters ------>
#--------------------------------------------------------
#--------------------------------------------------------
#----------------
# Error Checking
#----------------
if ( ! -e $WORKDIR ) then
echo " "
echo "Error: unable to change to working directory."
echo " $WORKDIR"
echo " "
echo "Job not submitted."
exit
endif
if ( ! -f $EXECUTABLE ) then
echo " "
echo "Error: unable to find launcher executable $EXECUTABLE."
echo " "
echo "Job not submitted."
exit
endif
if ( ! -f $WORKDIR/$CONTROL_FILE ) then
echo " "
echo "Error: unable to find input control file $CONTROL_FILE."
echo " "
echo "Job not submitted."
exit
endif
#----------------
# Job Submission
#----------------
cd $WORKDIR/
echo " WORKING DIR: $WORKDIR/"
$TACC_LAUNCHER_DIR/paramrun $EXECUTABLE $CONTROL_FILE
echo " "
echo " Parameteric Job Complete"
echo " "
*********************
split -d -l 1500000 Sample_5_L003_R1.cat.fastq r1.
split -d -l 1500000 Sample_5_L003_R2.cat.fastq r2.
Expand here to see example merge.sge and merge.script
#!/bin/csh
#
# Simple SGE script for submitting multiple serial
# jobs (e.g. parametric studies) using a script wrapper
# to launch the jobs.
#
# To use, build the launcher executable and your
# serial application(s) and place them in your WORKDIR
# directory. Then, edit the CONTROL_FILE to specify
# each executable per process.
#-------------------------------------------------------
#-------------------------------------------------------
#
# <------ Setup Parameters ------>
#
#$ -N merge
#$ -pe 4way 72
#$ -q normal
#$ -o merge.o$JOB_ID
#$ -l h_rt=1:00:00
#$ -V
#$ -cwd
# <------ You MUST Specify a Project String ----->
#$ -A DNAdenovo
#------------------------------------------------------
#
# Usage:
# #$ -pe <parallel environment> <number of slots>
# #$ -l h_rt=hours:minutes:seconds to specify run time limit
# #$ -N <job name>
# #$ -q <queue name>
# #$ -o <job output file>
# NOTE: The env variable $JOB_ID contains the job id.
#
module load launcher
setenv EXECUTABLE $TACC_LAUNCHER_DIR/init_launcher
setenv CONTROL_FILE merge.script
setenv WORKDIR .
#
# Variable description:
#
# EXECUTABLE = full path to the job launcher executable
# CONTROL_FILE = text input file which specifies
# executable for each process
# (should be located in WORKDIR)
# WORKDIR = location of working directory
#
# <------ End Setup Parameters ------>
#--------------------------------------------------------
#--------------------------------------------------------
#----------------
# Error Checking
#----------------
if ( ! -e $WORKDIR ) then
echo " "
echo "Error: unable to change to working directory."
echo " $WORKDIR"
echo " "
echo "Job not submitted."
exit
endif
if ( ! -f $EXECUTABLE ) then
echo " "
echo "Error: unable to find launcher executable $EXECUTABLE."
echo " "
echo "Job not submitted."
exit
endif
if ( ! -f $WORKDIR/$CONTROL_FILE ) then
echo " "
echo "Error: unable to find input control file $CONTROL_FILE."
echo " "
echo "Job not submitted."
exit
endif
#----------------
# Job Submission
#----------------
cd $WORKDIR/
echo " WORKING DIR: $WORKDIR/"
$TACC_LAUNCHER_DIR/paramrun $EXECUTABLE $CONTROL_FILE
echo " "
echo " Parameteric Job Complete"
echo " "
samtools merge -f chr6.sorted.bam b.*/chr6.mapped.*.sorted.bam; samtools index chr6.sorted.bam
samtools merge -f chrX.sorted.bam b.*/chrX.mapped.*.sorted.bam; samtools index chrX.sorted.bam
samtools merge -f chr17.sorted.bam b.*/chr17.mapped.*.sorted.bam; samtools index chr17.sorted.bam
samtools merge -f chr21.sorted.bam b.*/chr21.mapped.*.sorted.bam; samtools index chr21.sorted.bam
samtools merge -f chr5.sorted.bam b.*/chr5.mapped.*.sorted.bam; samtools index chr5.sorted.bam
samtools merge -f chrY.sorted.bam b.*/chrY.mapped.*.sorted.bam; samtools index chrY.sorted.bam
samtools merge -f chr4.sorted.bam b.*/chr4.mapped.*.sorted.bam; samtools index chr4.sorted.bam
samtools merge -f chr19.sorted.bam b.*/chr19.mapped.*.sorted.bam; samtools index chr19.sorted.bam
samtools merge -f chr13.sorted.bam b.*/chr13.mapped.*.sorted.bam; samtools index chr13.sorted.bam
samtools merge -f chr16.sorted.bam b.*/chr16.mapped.*.sorted.bam; samtools index chr16.sorted.bam
samtools merge -f chr7.sorted.bam b.*/chr7.mapped.*.sorted.bam; samtools index chr7.sorted.bam
samtools merge -f chr9.sorted.bam b.*/chr9.mapped.*.sorted.bam; samtools index chr9.sorted.bam
samtools merge -f chr14.sorted.bam b.*/chr14.mapped.*.sorted.bam; samtools index chr14.sorted.bam
samtools merge -f chr11.sorted.bam b.*/chr11.mapped.*.sorted.bam; samtools index chr11.sorted.bam
samtools merge -f chr22.sorted.bam b.*/chr22.mapped.*.sorted.bam; samtools index chr22.sorted.bam
samtools merge -f chr1.sorted.bam b.*/chr1.mapped.*.sorted.bam; samtools index chr1.sorted.bam
samtools merge -f chr10.sorted.bam b.*/chr10.mapped.*.sorted.bam; samtools index chr10.sorted.bam
samtools merge -f chr15.sorted.bam b.*/chr15.mapped.*.sorted.bam; samtools index chr15.sorted.bam
samtools merge -f chr18.sorted.bam b.*/chr18.mapped.*.sorted.bam; samtools index chr18.sorted.bam
samtools merge -f chr3.sorted.bam b.*/chr3.mapped.*.sorted.bam; samtools index chr3.sorted.bam
samtools merge -f chr20.sorted.bam b.*/chr20.mapped.*.sorted.bam; samtools index chr20.sorted.bam
samtools merge -f chr8.sorted.bam b.*/chr8.mapped.*.sorted.bam; samtools index chr8.sorted.bam
samtools merge -f chr2.sorted.bam b.*/chr2.mapped.*.sorted.bam; samtools index chr2.sorted.bam
samtools merge -f chr12.sorted.bam b.*/chr12.mapped.*.sorted.bam; samtools index chr12.sorted.bam
Expand here to see example map.sge and map.script; note that the fastExon.sh script creates these within subdirectories.
#!/bin/csh
#
# Simple SGE script for submitting multiple serial
# jobs (e.g. parametric studies) using a script wrapper
# to launch the jobs.
#
# To use, build the launcher executable and your
# serial application(s) and place them in your WORKDIR
# directory. Then, edit the CONTROL_FILE to specify
# each executable per process.
#-------------------------------------------------------
#-------------------------------------------------------
#
# <------ Setup Parameters ------>
#
#$ -N map.b.1
#$ -pe 2way 12
#$ -q normal
#$ -o map.b.1.o$JOB_ID
#$ -l h_rt=1:00:00
#$ -V
#$ -cwd
# <------ You MUST Specify a Project String ----->
#$ -A DNAdenovo
#------------------------------------------------------
#
# Usage:
# #$ -pe <parallel environment> <number of slots>
# #$ -l h_rt=hours:minutes:seconds to specify run time limit
# #$ -N <job name>
# #$ -q <queue name>
# #$ -o <job output file>
# NOTE: The env variable $JOB_ID contains the job id.
#
module load launcher
setenv EXECUTABLE $TACC_LAUNCHER_DIR/init_launcher
setenv CONTROL_FILE map.script
setenv WORKDIR .
#
# Variable description:
#
# EXECUTABLE = full path to the job launcher executable
# CONTROL_FILE = text input file which specifies
# executable for each process
# (should be located in WORKDIR)
# WORKDIR = location of working directory
#
# <------ End Setup Parameters ------>
#--------------------------------------------------------
#--------------------------------------------------------
#----------------
# Error Checking
#----------------
if ( ! -e $WORKDIR ) then
echo " "
echo "Error: unable to change to working directory."
echo " $WORKDIR"
echo " "
echo "Job not submitted."
exit
endif
if ( ! -f $EXECUTABLE ) then
echo " "
echo "Error: unable to find launcher executable $EXECUTABLE."
echo " "
echo "Job not submitted."
exit
endif
if ( ! -f $WORKDIR/$CONTROL_FILE ) then
echo " "
echo "Error: unable to find input control file $CONTROL_FILE."
echo " "
echo "Job not submitted."
exit
endif
#----------------
# Job Submission
#----------------
cd $WORKDIR/
echo " WORKING DIR: $WORKDIR/"
$TACC_LAUNCHER_DIR/paramrun $EXECUTABLE $CONTROL_FILE
echo " "
echo " Parameteric Job Complete"
echo " "
/home1/01057/sphsmith/local/bin/exome_step1.bash r1.00 r2.00 /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta mapped.00 >& mapped.00.log
/home1/01057/sphsmith/local/bin/exome_step1.bash r1.01 r2.01 /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta mapped.01 >& mapped.01.log
Expand here to see example variants.sge and variants.script
#!/bin/csh
#
# Simple SGE script for submitting multiple serial
# jobs (e.g. parametric studies) using a script wrapper
# to launch the jobs.
#
# To use, build the launcher executable and your
# serial application(s) and place them in your WORKDIR
# directory. Then, edit the CONTROL_FILE to specify
# each executable per process.
#-------------------------------------------------------
#-------------------------------------------------------
#
# <------ Setup Parameters ------>
#
#$ -N variants
#$ -pe 2way 144
#$ -q normal
#$ -o variants.o$JOB_ID
#$ -l h_rt=4:00:00
#$ -V
#$ -cwd
# <------ You MUST Specify a Project String ----->
#$ -A DNAdenovo
#------------------------------------------------------
#
# Usage:
# #$ -pe <parallel environment> <number of slots>
# #$ -l h_rt=hours:minutes:seconds to specify run time limit
# #$ -N <job name>
# #$ -q <queue name>
# #$ -o <job output file>
# NOTE: The env variable $JOB_ID contains the job id.
#
module load launcher
module load java64
module load samtools
setenv EXECUTABLE $TACC_LAUNCHER_DIR/init_launcher
setenv CONTROL_FILE variants.script
setenv WORKDIR .
#
# Variable description:
#
# EXECUTABLE = full path to the job launcher executable
# CONTROL_FILE = text input file which specifies
# executable for each process
# (should be located in WORKDIR)
# WORKDIR = location of working directory
#
# <------ End Setup Parameters ------>
#--------------------------------------------------------
#--------------------------------------------------------
#----------------
# Error Checking
#----------------
if ( ! -e $WORKDIR ) then
echo " "
echo "Error: unable to change to working directory."
echo " $WORKDIR"
echo " "
echo "Job not submitted."
exit
endif
if ( ! -f $EXECUTABLE ) then
echo " "
echo "Error: unable to find launcher executable $EXECUTABLE."
echo " "
echo "Job not submitted."
exit
endif
if ( ! -f $WORKDIR/$CONTROL_FILE ) then
echo " "
echo "Error: unable to find input control file $CONTROL_FILE."
echo " "
echo "Job not submitted."
exit
endif
#----------------
# Job Submission
#----------------
cd $WORKDIR/
echo " WORKING DIR: $WORKDIR/"
$TACC_LAUNCHER_DIR/paramrun $EXECUTABLE $CONTROL_FILE
echo " "
echo " Parameteric Job Complete"
echo " "
/home1/01057/sphsmith/local/bin/exome_step2.bash chrY.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chrY >& variants.chrY.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr22.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr22 >& variants.chr22.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr9.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr9 >& variants.chr9.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr3.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr3 >& variants.chr3.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr21.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr21 >& variants.chr21.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr5.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr5 >& variants.chr5.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr16.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr16 >& variants.chr16.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr19.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr19 >& variants.chr19.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr18.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr18 >& variants.chr18.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr4.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr4 >& variants.chr4.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr12.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr12 >& variants.chr12.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr15.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr15 >& variants.chr15.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr14.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr14 >& variants.chr14.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chrX.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chrX >& variants.chrX.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr6.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr6 >& variants.chr6.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr13.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr13 >& variants.chr13.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr8.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr8 >& variants.chr8.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr7.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr7 >& variants.chr7.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr11.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr11 >& variants.chr11.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr20.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr20 >& variants.chr20.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr10.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr10 >& variants.chr10.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr1.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr1 >& variants.chr1.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr2.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr2 >& variants.chr2.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr17.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr17 >& variants.chr17.log
Expand here to see example merge2.sge and merge2.script
#!/bin/csh
#
# Simple SGE script for submitting multiple serial
# jobs (e.g. parametric studies) using a script wrapper
# to launch the jobs.
#
# To use, build the launcher executable and your
# serial application(s) and place them in your WORKDIR
# directory. Then, edit the CONTROL_FILE to specify
# each executable per process.
#-------------------------------------------------------
#-------------------------------------------------------
#
# <------ Setup Parameters ------>
#
#$ -N merge2
#$ -pe 4way 12
#$ -q normal
#$ -o merge2.o$JOB_ID
#$ -l h_rt=1:00:00
#$ -V
#$ -cwd
# <------ You MUST Specify a Project String ----->
#$ -A DNAdenovo
#------------------------------------------------------
#
# Usage:
# #$ -pe <parallel environment> <number of slots>
# #$ -l h_rt=hours:minutes:seconds to specify run time limit
# #$ -N <job name>
# #$ -q <queue name>
# #$ -o <job output file>
# NOTE: The env variable $JOB_ID contains the job id.
#
module load launcher
setenv EXECUTABLE $TACC_LAUNCHER_DIR/init_launcher
setenv CONTROL_FILE merge2.script
setenv WORKDIR .
#
# Variable description:
#
# EXECUTABLE = full path to the job launcher executable
# CONTROL_FILE = text input file which specifies
# executable for each process
# (should be located in WORKDIR)
# WORKDIR = location of working directory
#
# <------ End Setup Parameters ------>
#--------------------------------------------------------
#--------------------------------------------------------
#----------------
# Error Checking
#----------------
if ( ! -e $WORKDIR ) then
echo " "
echo "Error: unable to change to working directory."
echo " $WORKDIR"
echo " "
echo "Job not submitted."
exit
endif
if ( ! -f $EXECUTABLE ) then
echo " "
echo "Error: unable to find launcher executable $EXECUTABLE."
echo " "
echo "Job not submitted."
exit
endif
if ( ! -f $WORKDIR/$CONTROL_FILE ) then
echo " "
echo "Error: unable to find input control file $CONTROL_FILE."
echo " "
echo "Job not submitted."
exit
endif
#----------------
# Job Submission
#----------------
cd $WORKDIR/
echo " WORKING DIR: $WORKDIR/"
$TACC_LAUNCHER_DIR/paramrun $EXECUTABLE $CONTROL_FILE
echo " "
echo " Parameteric Job Complete"
echo " "
samtools merge -f Sample_5_L003_R1.cat.fastq.sorted.bam *.sorted.bam
Welcome to the University Wiki Service! Please use your IID (yourEID@eid.utexas.edu) when prompted for your email address during login or click here to enter your EID. If you are experiencing any issues loading content on pages, please try these steps to clear your browser cache.