...
Step 1: Evaluate and manipulate raw data
Code Block |
---|
|
cd quality_exercise
head data/Sample1_R1.fastq
wc -l data/Sample1_R1.fastq
grep -c '^@HWI' data/Sample1_R1.fastq
module spider fastqc
module load fastqc
fastqc -h
#DONT RUN ON HEAD NODE:
fastqc data/Sample1_R1.fastq |
Look at Fastqc results:
Ideal dataset: http://www.bioinformatics.babraham.ac.uk/projects/fastqc/good_sequence_short_fastqc.html
Our dataset: http://web.corral.tacc.utexas.edu/BioITeam/rnaseq_course/fastqc_exercise/Sample1_R1_fastqc/fastqc_report.html
Code Block |
---|
title | manipulate fastq files- quality trimming and filtering |
---|
|
module spider fastx
module load fastx_toolkit
#DONT RUN ON HEAD NODE:
fastx_trimmer -i data/Sample1_R1.fastq -l 90 -Q 33 -o Sample1_R1.trimmed.fastq
#DONT RUN ON HEAD NODE:
fastq_quality_filter -q 20 -p 80 -i data/Sample1_R1.fastq -Q 33 -o Sample1_R1.filtered.fastq
grep -c '^@HWI' results/Sample1_R1.trimmed.fastq
grep -c '^@HWI' results/Sample1_R1.filtered.fastq |
Step 2: Map Raw Data to Reference, Assess results
Code Block |
---|
|
cd ..
cd mapping_exercise
module spider bwa
module load bwa/0.7.12
#DONT RUN ON HEAD NODE:
bwa index -a bwtsw reference/transcripts.fasta
bwa mem
#DONT RUN ON HEAD NODE:
bwa mem ../reference/transcripts.fasta ../data/GSM794483_C1_R1_1.fq data/GSM794483_C1_R1_2.fq > C1_R1.mem.sam
|
Code Block |
---|
|
module spider hisat
#DONT RUN ON HEAD NODE
hisat2-build reference/genome.fa reference/genome.fa
#DONT RUN ON HEAD NODE:
hisat2 -x ../reference/genome.fa -1 ../data/GSM794483_C1_R1_1.fq -2 ../data/GSM794483_C1_R1_2.fq -S GSM794483_C1.sam --phred33 --novel-splicesite-outfile GSM794483_C1.junctions --rna-strandness RF --dta -t |
Code Block |
---|
title | Assess Mapping Results |
---|
|
module load samtools
#SYNTAX:
samtools view -b -S samfile > bamfile
samtools sort bamfile sortedbamfile
samtools index sortedbamfile
#BWA RESULTS
samtools flagstat bwa_mem_results_transcriptome/C1_R1.mem.bam
samtools idxstats bwa_mem_results_transcriptome/C1_R1.mem.bam
#How is hisat2 result different from bwa
head -20 hisat_exercise/results/GSM794483_C1.sam
grep -v '^@' bwa_exercise/bwa_mem_results_transcriptome/C1_R1.mem.sam|head|cut -f 6
grep -v '^@' hisat_exercise/results/GSM794483_C1.sam|head|cut -f 6
#DONT RUN ON HEAD NODE
grep -v '^@' hisat_exercise/results/GSM794483_C1.sam|cut -f 6|wc -l |
Step 3 and Step 4 : Quantify transcripts and ID DEGs
Code Block |
---|
title | Look at bedtools and DESeq results |
---|
|
cd deg_exercise/
module swap intel gcc/4.7.1
module load bedtools
bedtools
bedtools multicov
#DONT RUN ON HEAD NODE
bedtools multicov -bams hisat_results/C1_R1.sorted.bam hisat_results/C1_R2.sorted.bam hisat_results/C1_R3.sorted.bam hisat_results/C2_R1.sorted.bam hisat_results/C2_R2.sorted.bam hisat_results/C2_R3.sorted.bam -bed ../reference/genes.formatted.gtf > gene_counts.gff
#What's the difference between bedtools and htseq results?
#First look at the annotation file
head ../mapping_exercise/reference/genes.formatted.gtf
#bedtools result
head gene_counts.gff
wc -l gene_counts.gff
#htseq result
head gene_counts.htseq.gff
wc -l gene_counts.htseq.gff
#DESEQ2 result:
head deseq_results/DESeq-C1-vs-C2.csv
#Find the top 10 upregulated genes
sed 's/,/\t/g' deseq_results/DESeq-C1-vs-C2.csv|sort -n -r -k3k7,37|cut -f 12,37|head
#If there's an idiosyncracy with sort
sed 's/,/\t/g' deseq_results/DESeq-C1-vs-C2.csvsort -n -r -k3k7,37|grep -v 'e-0'|cut -f 12,37|head
#Find the top 10 downregulated genes
sed 's/,/\t/g' deseq_results/DESeq-C1-vs-C2.csv|sort -n -k3k7,37|cut -f 12,37|head
#If there's an idiosyncracy with sort
sed 's/,/\t/g' deseq_results/DESeq-C1-vs-C2.csv|sort -n -k3k7,37|grep -v 'e-0'|cut -f 12,37|head
#Count the DEGs
sed 's/,/\t/g' deseq_results/DESeq-C1-vs-C2.csv|awk '{if ((($3>$7>=1)||($3<$7<=-1))&&($6<$9<=0.05)) print $1,$3,$6}'|wc -l |