Example BWA alignment script

Example BWA alignment script

 

This file is also in $BI/scripts and is called "align_bwa.sh"

#!/bin/bash IN_FQ=$1 OUT_PFX=$2 ASSEMBLY=$3 PAIRED=$4 # show usage if we don't have 4 command-line arguments if [ "$PAIRED" == "" ]; then     echo "-----------------------------------------------------------------";     echo "Align fastq data with bwa, producing a sorted indexed BAM file.";     echo " ";     echo "align_bwa.sh in_file out_pfx assembly paired(0|1)";     echo " ";     echo "  in_file   For single-end alignments, path of the input fastq file.";     echo "            For paired-end alignemtts, path to the the R1 fastq file"     echo "            which must contain the string '_R1.' in its name. The";     echo "            corresponding 'R2' must have the same path except for '_R1'";     echo "  out_pfx   Desired prefix of output files.";     echo "  assembly  One of: hg19 hg18 mm10 mm9 sacCer3 sacCer3 ecoli";     echo "  paired    0 = single end alignment; 1 = paired end.";     echo " ";     echo "Example:";     echo "  align_bwa.sh my.fastq mrna_b1_ln1 hg18 0";     echo "  align_bwa.sh my_L001_R1.fastq swi6_b2_ln1 sacCer3 1";     exit 1; fi # general function that exits after printing its text argument #   in a standard format which can be easily grep'd. err() {   echo "$1...exiting";   exit 1; # any non-0 exit code signals an error } # function to check return code of programs. # exits with standard message if code is non-zero; # otherwise displays completiong message and date. #   arg 1 is the return code (usually $?) #   arg2 is text describing what ran ckRes() {   if [ "$1" == "0" ]; then     echo "..Done $2 `date`";   else     err "$2 returned non-0 exit code $1";   fi } # function that checks if a file exists #   arg 1 is the file name #   arg2 is text describing the file (optional) ckFile() {   if [ ! -e "$1" ]; then     err "$2 File '$1' not found";   fi } # function that checks if a file exists and #   that it has non-0 length. needed because #   programs don't always return non-0 return #   codes, and worse, they also create their #   output file with 0 length so that just #   checking for its existence is not enough #   to ensure the program ran properly ckFileSz() {   ckFile $1 $2;   SZ=`ls -l $1 | awk '{print $5}'`;   if [ "$SZ" == "0" ]; then     err "$2 file '$1' is zero length";   fi } # -------------- # Defaulting # -------------- # If aligning read pairs, find the name of the R2 file #   based on the R1 filename. if [ "$PAIRED" == "1" ]; then     IN_FQ_R1="$IN_FQ";     IN_FQ_R2=${IN_FQ_R1/_R1./_R2.}; fi # Find the path of the BWA reference based on the assembly #   name provided. Assumes a common directory structure # At TACC the structure is rooted a common BioITeam directory. #   Set WORK_COMMON appropriately outside this script if #   not running at TACC. : ${WORK_COMMON:=/corral-repl/utexas/BioITeam/} # Note: reference indexes created with bwa 0.5.9 are *not* #   compatible with bwa 0.6 or higher. So we go to some #   trouble to figure out which BWA version we have BWA_VER=`bwa 2>&1 | grep Version | awk '{print $2}' | awk -F '.' '{print $1 "." $2}'`; if [ "$BWA_VER" == "0.5" ]; then BWA_DIR=bwa; elif [ "$BWA_VER" == "0.6" ]; then BWA_DIR=bwa6; else BWA_DIR=unknown_bwa_version; fi REFBASE="$WORK_COMMON/ref_genome/$BWA_DIR/base" if [ "$ASSEMBLY" == "hg18" ]; then     REF_PFX="$REFBASE/ucsc/$ASSEMBLY/Homo_sapiens_assembly18.fasta"; elif [ "$ASSEMBLY" == "hg19" ]; then     REF_PFX="$REFBASE/ucsc/$ASSEMBLY/ucsc.hg19.fasta"; elif [ "$ASSEMBLY" == "ecoli" ]; then     REF_PFX="$REFBASE/misc/$ASSEMBLY/REL606.5.fasta"; else     REF_PFX="$REFBASE/ucsc/$ASSEMBLY/${ASSEMBLY}.fa"; fi # Read group information is part of the SAM/BAM header that desribes #   what is being aligned. When multiple lanes of data are combined #   from separate BAM files, read groups provide identification of the #   source of each read. Variant callers such as GATK depend on #   having well-defined read group information. # Here we set the RG variable to be the read group line we want #   inserted in the header. READ_GRP=$OUT_PFX; RG='@RG\tID:1\tPL:ILLUMINA\tSM:'$READ_GRP'\tDS:ref='$ASSEMBLY',pfx='$REF_PFX # Display how the program will be run, including #   defaulted arguments. Do this before running #   checks so user can see what went wrong. echo "================================================================="; echo "align_bwa.sh - `date`"; if [ "$PAIRED" == "1" ]; then echo "  fastq read1 file:  $IN_FQ_R1"; echo "  fastq read2 file:  $IN_FQ_R2"; else echo "  input file:        $IN_FQ"; fi echo "  output prefix:     $OUT_PFX"; echo "  assembly:          $ASSEMBLY"; echo "  bwa version:       $BWA_VER"; echo "  ref prefix:        $REF_PFX"; echo "  read group line:   $RG"; echo "---------------------------------------------------------"; # ------------------ # Error Checks # ------------------ # Make sure the fastq file(s) exist. # For paired end data, also make sure we have #   two different files. if [ "$PAIRED" == "1" ]; then     ckFile "$IN_FQ_R1" "Fastq read1";     ckFile "$IN_FQ_R2" "Fastq read2";     if [ "$IN_FQ_R1" == "$IN_FQ_R2" ]; then         err "Fastq read1 and read2 files are the same: '$IN_FQ_R2'";     fi else     ckFile "$IN_FQ" "Input fastq"; fi # Make sure we have found an appropriate reference #   by checking that one of the standard files exists. ckFile "${REF_PFX}.amb" "$ASSEMBLY Reference"; # Make sure version information for our two programs is #   part of our execution record. This is done by #   calling the programs with no arguments echo "---------------------------------------------------------"; echo "Program version information"; echo "---------------------------------------------------------"; bwa samtools # ------------------ # The actual work! # ------------------ if [ "$PAIRED" == "0" ]; then     echo "---------------------------------------------------------";     echo "Running bwa aln (single end reads)";     echo "---------------------------------------------------------";     bwa aln $REF_PFX $IN_FQ > $OUT_PFX.sai     ckRes $? "bwa aln";     ckFileSz "$OUT_PFX.sai";          echo "---------------------------------------------------------";     echo "Running bwa samse";     echo "---------------------------------------------------------";     bwa samse -r "$RG" $REF_PFX $OUT_PFX.sai $IN_FQ | samtools view -b -S - > $OUT_PFX.bam;     ckRes $? "bwa samse";     ckFileSz "$OUT_PFX.bam"; else     echo "---------------------------------------------------------";     echo "Running bwa aln on read1, read2 ends";     echo "`date`";     echo "---------------------------------------------------------";          echo "Aligning '$IN_FQ_R1'...";     bwa aln $REF_PFX $IN_FQ_R1 > $OUT_PFX.read1.sai;     ckRes $? "bwa aln read1";     ckFileSz "$OUT_PFX.read1.sai";              echo "Aligning '$IN_FQ_R2'...";     bwa aln $REF_PFX $IN_FQ_R2 > $OUT_PFX.read2.sai;     ckRes $? "bwa aln read2";     ckFileSz "$OUT_PFX.read2.sai";          echo "---------------------------------------------------------";     echo "Running bwa sampe";     echo "---------------------------------------------------------";     bwa sampe -r "$RG" $REF_PFX $OUT_PFX.read1.sai $OUT_PFX.read2.sai $IN_FQ_R1 $IN_FQ_R2 | samtools view -b -S - > $OUT_PFX.bam;     ckRes $? "bwa sampe";     ckFileSz "$OUT_PFX.bam"; fi echo "---------------------------------------------------------"; echo "Creating sorted, indexed bam file"; echo "---------------------------------------------------------"; echo "Sorting '$OUT_PFX.bam'..."; samtools sort $OUT_PFX.bam $OUT_PFX.sorted; ckRes $? "samtools sort"; ckFileSz "$OUT_PFX.sorted.bam"; echo "Indexing '$OUT_PFX.sorted.bam'..."; samtools index $OUT_PFX.sorted.bam; ckRes $? "samtools index"; ckFileSz "$OUT_PFX.sorted.bam.bai"; echo "---------------------------------------------------------"; echo "Collecting alignment statistics"; echo "---------------------------------------------------------"; echo "Running flagstat..."; samtools flagstat $OUT_PFX.sorted.bam | tee $OUT_PFX.flagstat.txt ckRes $? "samtools flagstat"; ckFileSz "$OUT_PFX.flagstat.txt"; # If we make it here, all went well. Exit with a standard #   message that can be easily grep'd echo "---------------------------------------------------------"; echo "All bwa alignment tasks completed successfully!"; echo "`date`"; echo "---------------------------------------------------------"; exit 0;