Example BWA alignment script

 

This file is also in $BI/scripts and is called "align_bwa.sh"

#!/bin/bash

IN_FQ=$1
OUT_PFX=$2
ASSEMBLY=$3
PAIRED=$4

# show usage if we don't have 4 command-line arguments
if [ "$PAIRED" == "" ]; then
    echo "-----------------------------------------------------------------";
    echo "Align fastq data with bwa, producing a sorted indexed BAM file.";
    echo " ";
    echo "align_bwa.sh in_file out_pfx assembly paired(0|1)";
    echo " ";
    echo "  in_file   For single-end alignments, path of the input fastq file.";
    echo "            For paired-end alignemtts, path to the the R1 fastq file"
    echo "            which must contain the string '_R1.' in its name. The";
    echo "            corresponding 'R2' must have the same path except for '_R1'";
    echo "  out_pfx   Desired prefix of output files.";
    echo "  assembly  One of: hg19 hg18 mm10 mm9 sacCer3 sacCer3 ecoli";
    echo "  paired    0 = single end alignment; 1 = paired end.";
    echo " ";
    echo "Example:";
    echo "  align_bwa.sh my.fastq mrna_b1_ln1 hg18 0";
    echo "  align_bwa.sh my_L001_R1.fastq swi6_b2_ln1 sacCer3 1";
    exit 1;
fi

# general function that exits after printing its text argument
#   in a standard format which can be easily grep'd.
err() {
  echo "$1...exiting";
  exit 1; # any non-0 exit code signals an error
}
# function to check return code of programs.
# exits with standard message if code is non-zero;
# otherwise displays completiong message and date.
#   arg 1 is the return code (usually $?)
#   arg2 is text describing what ran
ckRes() {
  if [ "$1" == "0" ]; then
    echo "..Done $2 `date`";
  else
    err "$2 returned non-0 exit code $1";
  fi
}
# function that checks if a file exists
#   arg 1 is the file name
#   arg2 is text describing the file (optional)
ckFile() {
  if [ ! -e "$1" ]; then
    err "$2 File '$1' not found";
  fi
}
# function that checks if a file exists and
#   that it has non-0 length. needed because
#   programs don't always return non-0 return
#   codes, and worse, they also create their
#   output file with 0 length so that just
#   checking for its existence is not enough
#   to ensure the program ran properly
ckFileSz() {
  ckFile $1 $2;
  SZ=`ls -l $1 | awk '{print $5}'`;
  if [ "$SZ" == "0" ]; then
    err "$2 file '$1' is zero length";
  fi
}

# --------------
# Defaulting
# --------------
# If aligning read pairs, find the name of the R2 file
#   based on the R1 filename.
if [ "$PAIRED" == "1" ]; then
    IN_FQ_R1="$IN_FQ";
    IN_FQ_R2=${IN_FQ_R1/_R1./_R2.};
fi

# Find the path of the BWA reference based on the assembly
#   name provided. Assumes a common directory structure
# At TACC the structure is rooted a common BioITeam directory.
#   Set WORK_COMMON appropriately outside this script if
#   not running at TACC.
: ${WORK_COMMON:=/corral-repl/utexas/BioITeam/}
# Note: reference indexes created with bwa 0.5.9 are *not*
#   compatible with bwa 0.6 or higher. So we go to some
#   trouble to figure out which BWA version we have
BWA_VER=`bwa 2>&1 | grep Version | awk '{print $2}' | awk -F '.' '{print $1 "." $2}'`;
if [ "$BWA_VER" == "0.5" ]; then BWA_DIR=bwa;
elif [ "$BWA_VER" == "0.6" ]; then BWA_DIR=bwa6;
else BWA_DIR=unknown_bwa_version; fi
REFBASE="$WORK_COMMON/ref_genome/$BWA_DIR/base"
if [ "$ASSEMBLY" == "hg18" ]; then
    REF_PFX="$REFBASE/ucsc/$ASSEMBLY/Homo_sapiens_assembly18.fasta";
elif [ "$ASSEMBLY" == "hg19" ]; then
    REF_PFX="$REFBASE/ucsc/$ASSEMBLY/ucsc.hg19.fasta";
elif [ "$ASSEMBLY" == "ecoli" ]; then
    REF_PFX="$REFBASE/misc/$ASSEMBLY/REL606.5.fasta";
else
    REF_PFX="$REFBASE/ucsc/$ASSEMBLY/${ASSEMBLY}.fa";
fi

# Read group information is part of the SAM/BAM header that desribes
#   what is being aligned. When multiple lanes of data are combined
#   from separate BAM files, read groups provide identification of the
#   source of each read. Variant callers such as GATK depend on
#   having well-defined read group information.
# Here we set the RG variable to be the read group line we want
#   inserted in the header.
READ_GRP=$OUT_PFX;
RG='@RG\tID:1\tPL:ILLUMINA\tSM:'$READ_GRP'\tDS:ref='$ASSEMBLY',pfx='$REF_PFX

# Display how the program will be run, including
#   defaulted arguments. Do this before running
#   checks so user can see what went wrong.
echo "=================================================================";
echo "align_bwa.sh - `date`";
if [ "$PAIRED" == "1" ]; then
echo "  fastq read1 file:  $IN_FQ_R1";
echo "  fastq read2 file:  $IN_FQ_R2";
else
echo "  input file:        $IN_FQ";
fi
echo "  output prefix:     $OUT_PFX";
echo "  assembly:          $ASSEMBLY";
echo "  bwa version:       $BWA_VER";
echo "  ref prefix:        $REF_PFX";
echo "  read group line:   $RG";
echo "---------------------------------------------------------";

# ------------------
# Error Checks
# ------------------
# Make sure the fastq file(s) exist.
# For paired end data, also make sure we have
#   two different files.
if [ "$PAIRED" == "1" ]; then
    ckFile "$IN_FQ_R1" "Fastq read1";
    ckFile "$IN_FQ_R2" "Fastq read2";
    if [ "$IN_FQ_R1" == "$IN_FQ_R2" ]; then
        err "Fastq read1 and read2 files are the same: '$IN_FQ_R2'";
    fi
else
    ckFile "$IN_FQ" "Input fastq";
fi

# Make sure we have found an appropriate reference
#   by checking that one of the standard files exists.
ckFile "${REF_PFX}.amb" "$ASSEMBLY Reference";

# Make sure version information for our two programs is
#   part of our execution record. This is done by
#   calling the programs with no arguments
echo "---------------------------------------------------------";
echo "Program version information";
echo "---------------------------------------------------------";
bwa
samtools

# ------------------
# The actual work!
# ------------------

if [ "$PAIRED" == "0" ]; then
    echo "---------------------------------------------------------";
    echo "Running bwa aln (single end reads)";
    echo "---------------------------------------------------------";
    bwa aln $REF_PFX $IN_FQ > $OUT_PFX.sai
    ckRes $? "bwa aln";
    ckFileSz "$OUT_PFX.sai";
    
    echo "---------------------------------------------------------";
    echo "Running bwa samse";
    echo "---------------------------------------------------------";
    bwa samse -r "$RG" $REF_PFX $OUT_PFX.sai $IN_FQ | samtools view -b -S - > $OUT_PFX.bam;
    ckRes $? "bwa samse";
    ckFileSz "$OUT_PFX.bam";
else
    echo "---------------------------------------------------------";
    echo "Running bwa aln on read1, read2 ends";
    echo "`date`";
    echo "---------------------------------------------------------";
    
    echo "Aligning '$IN_FQ_R1'...";
    bwa aln $REF_PFX $IN_FQ_R1 > $OUT_PFX.read1.sai;
    ckRes $? "bwa aln read1";
    ckFileSz "$OUT_PFX.read1.sai";
        
    echo "Aligning '$IN_FQ_R2'...";
    bwa aln $REF_PFX $IN_FQ_R2 > $OUT_PFX.read2.sai;
    ckRes $? "bwa aln read2";
    ckFileSz "$OUT_PFX.read2.sai";
    
    echo "---------------------------------------------------------";
    echo "Running bwa sampe";
    echo "---------------------------------------------------------";
    bwa sampe -r "$RG" $REF_PFX $OUT_PFX.read1.sai $OUT_PFX.read2.sai $IN_FQ_R1 $IN_FQ_R2 | samtools view -b -S - > $OUT_PFX.bam;
    ckRes $? "bwa sampe";
    ckFileSz "$OUT_PFX.bam";
fi

echo "---------------------------------------------------------";
echo "Creating sorted, indexed bam file";
echo "---------------------------------------------------------";
echo "Sorting '$OUT_PFX.bam'...";
samtools sort $OUT_PFX.bam $OUT_PFX.sorted;
ckRes $? "samtools sort";
ckFileSz "$OUT_PFX.sorted.bam";

echo "Indexing '$OUT_PFX.sorted.bam'...";
samtools index $OUT_PFX.sorted.bam;
ckRes $? "samtools index";
ckFileSz "$OUT_PFX.sorted.bam.bai";

echo "---------------------------------------------------------";
echo "Collecting alignment statistics";
echo "---------------------------------------------------------";
echo "Running flagstat...";
samtools flagstat $OUT_PFX.sorted.bam | tee $OUT_PFX.flagstat.txt
ckRes $? "samtools flagstat";
ckFileSz "$OUT_PFX.flagstat.txt";

# If we make it here, all went well. Exit with a standard
#   message that can be easily grep'd
echo "---------------------------------------------------------";
echo "All bwa alignment tasks completed successfully!";
echo "`date`";
echo "---------------------------------------------------------";
exit 0;