Page Comparison

...

Step 1: Evaluate and manipulate raw data

Code Block

title	Count and Fastqc

cd quality_exercise
 
head data/Sample1_R1.fastq 

wc -l data/Sample1_R1.fastq 
grep -c '^@HWI' data/Sample1_R1.fastq 

module spider fastqc
module load fastqc
fastqc -h
#DONT RUN ON HEAD NODE:
fastqc data/Sample1_R1.fastq

Look at Fastqc results:

Ideal dataset: http://www.bioinformatics.babraham.ac.uk/projects/fastqc/good_sequence_short_fastqc.html

Our dataset: http://web.corral.tacc.utexas.edu/BioITeam/rnaseq_course/fastqc_exercise/Sample1_R1_fastqc/fastqc_report.html

Code Block

title	manipulate fastq files- quality trimming and filtering

module spider fastx
module load fastx_toolkit
#DONT RUN ON HEAD NODE:
fastx_trimmer -i data/Sample1_R1.fastq -l 90 -Q 33 -o Sample1_R1.trimmed.fastq
#DONT RUN ON HEAD NODE:
fastq_quality_filter -q 20 -p 80 -i data/Sample1_R1.fastq -Q 33 -o Sample1_R1.filtered.fastq


grep -c '^@HWI' results/Sample1_R1.trimmed.fastq 
grep -c '^@HWI' results/Sample1_R1.filtered.fastq

Step 2: Map Raw Data to Reference, Assess results

Code Block

title	Map with BWA

cd ..
cd mapping_exercise
module spider bwa
module load bwa/0.7.12


#DONT RUN ON HEAD NODE:
bwa index -a bwtsw reference/transcripts.fasta

bwa mem

#DONT RUN ON HEAD NODE:
bwa mem ../reference/transcripts.fasta ../data/GSM794483_C1_R1_1.fq data/GSM794483_C1_R1_2.fq > C1_R1.mem.sam

Code Block

title	Map with Tophat

module spider hisat

#DONT RUN ON HEAD NODE
hisat2-build reference/genome.fa reference/genome.fa

 
#DONT RUN ON HEAD NODE:
hisat2 -x ../reference/genome.fa -1 ../data/GSM794483_C1_R1_1.fq -2 ../data/GSM794483_C1_R1_2.fq -S GSM794483_C1.sam --phred33 --novel-splicesite-outfile GSM794483_C1.junctions --rna-strandness RF --dta -t

Code Block

title	Assess Mapping Results

module load samtools

#SYNTAX:
samtools view -b -S samfile > bamfile
samtools sort bamfile sortedbamfile
samtools index sortedbamfile


#BWA RESULTS
samtools flagstat bwa_mem_results_transcriptome/C1_R1.mem.bam
samtools idxstats bwa_mem_results_transcriptome/C1_R1.mem.bam


#How is hisat2 result different from bwa
head -20 hisat_exercise/results/GSM794483_C1.sam
grep -v '^@' bwa_exercise/bwa_mem_results_transcriptome/C1_R1.mem.sam|head|cut -f 6
grep -v '^@' hisat_exercise/results/GSM794483_C1.sam|head|cut -f 6


#DONT RUN ON HEAD NODE
grep -v '^@' hisat_exercise/results/GSM794483_C1.sam|cut -f 6|wc -l

Step 3 and Step 4 : Quantify transcripts and ID DEGs

Code Block

title	Look at bedtools and DESeq results

cd deg_exercise/
 
module swap intel gcc/4.7.1
module load bedtools
bedtools
bedtools multicov
#DONT RUN ON HEAD NODE
bedtools multicov -bams hisat_results/C1_R1.sorted.bam hisat_results/C1_R2.sorted.bam hisat_results/C1_R3.sorted.bam hisat_results/C2_R1.sorted.bam hisat_results/C2_R2.sorted.bam hisat_results/C2_R3.sorted.bam -bed ../reference/genes.formatted.gtf > gene_counts.gff


#What's the difference between bedtools and htseq results?
 
#First look at the annotation file
head ../mapping_exercise/reference/genes.formatted.gtf 

#bedtools result 
head gene_counts.gff 
wc -l gene_counts.gff
 
#htseq result
head gene_counts.htseq.gff 
wc -l gene_counts.htseq.gff
 
#DESEQ2 result:
head deseq_results/DESeq-C1-vs-C2.csv 
 
#Find the top 10 upregulated genes
sed 's/,/\t/g' deseq_results/DESeq-C1-vs-C2.csv|sort -n -r -k3k7,37|cut -f 12,37|head
#If there's an idiosyncracy with sort 
sed 's/,/\t/g' deseq_results/DESeq-C1-vs-C2.csvsort -n -r -k3k7,37|grep -v 'e-0'|cut -f 12,37|head
 
#Find the top 10 downregulated genes
sed 's/,/\t/g' deseq_results/DESeq-C1-vs-C2.csv|sort -n -k3k7,37|cut -f 12,37|head
#If there's an idiosyncracy with sort 
sed 's/,/\t/g' deseq_results/DESeq-C1-vs-C2.csv|sort -n -k3k7,37|grep -v 'e-0'|cut -f 12,37|head
 
#Count the DEGs
sed 's/,/\t/g' deseq_results/DESeq-C1-vs-C2.csv|awk '{if ((($3>$7>=1)||($3<$7<=-1))&&($6<$9<=0.05)) print $1,$3,$6}'|wc -l

Versions Compared

Old Version 24

New Version 25

Key

Step 1: Evaluate and manipulate raw data