...
| Code Block |
|---|
|
cds; cd core_ngs/bedtools_coverage
bedtools coverage -s -d -a merged.good.sc_genes.bed -b yeast_mrna.sort.filt.bam > yeast_mrna.gene_coverage.txt
wc -l yeast_mrna.gene_coverage.txt # 8,829,317 lines! |
It will complain a bit because our genes file includes the yeast plasmid "2-micron" but the RNAseq BAM doesn't include that contig. We'll ignore that warning.
...
| Code Block |
|---|
chrI 7234 9016 YAL067C 1 - 1 0
chrI 7234 9016 YAL067C 1 - 2 0
chrI 7234 9016 YAL067C 1 - 3 0
chrI 7234 9016 YAL067C 1 - 4 0
chrI 7234 9016 YAL067C 1 - 5 0
chrI 7234 9016 YAL067C 1 - 6 0
chrI 7234 9016 YAL067C 1 - 7 0
chrI 7234 9016 YAL067C 1 - 8 0
chrI 7234 9016 YAL067C 1 - 9 0
chrI 7234 9016 YAL067C 1 - 10 0
chrI 7234 9016 YAL067C 1 - 11 0
chrI 7234 9016 YAL067C 1 - 12 0
chrI 7234 9016 YAL067C 1 - 13 0
chrI 7234 9016 YAL067C 1 - 14 0
chrI 7234 9016 YAL067C 1 - 15 0
chrI 7234 9016 YAL067C 1 - 16 0
chrI 7234 9016 YAL067C 1 - 17 1
chrI 7234 9016 YAL067C 1 - 18 1
chrI 7234 9016 YAL067C 1 - 19 1
chrI 7234 9016 YAL067C 1 - 20 1
chrI 7234 9016 YAL067C 1 - 21 1
chrI 7234 9016 YAL067C 1 - 22 1
chrI 7234 9016 YAL067C 1 - 23 1
chrI 7234 9016 YAL067C 1 - 24 1
chrI 7234 9016 YAL067C 1 - 25 1
chrI 7234 9016 YAL067C 1 - 26 1
chrI 7234 9016 YAL067C 1 - 27 1
chrI 7234 9016 YAL067C 1 - 28 1
chrI 7234 9016 YAL067C 1 - 29 1
chrI 7234 9016 YAL067C 1 - 30 1
chrI 7234 9016 YAL067C 1 - 31 1
chrI 7234 9016 YAL067C 1 - 32 1
chrI 7234 9016 YAL067C 1 - 33 1
chrI 7234 9016 YAL067C 1 - 34 1
chrI 7234 9016 YAL067C 1 - 35 1
chrI 7234 9016 YAL067C 1 - 36 1
chrI 7234 9016 YAL067C 1 - 37 1
chrI 7234 9016 YAL067C 1 - 38 2
chrI 7234 9016 YAL067C 1 - 39 2
chrI 7234 9016 YAL067C 1 - 40 2
chrI 7234 9016 YAL067C 1 - 41 3
chrI 7234 9016 YAL067C 1 - 42 3
chrI 7234 9016 YAL067C 1 - 43 3
chrI 7234 9016 YAL067C 1 - 44 3
chrI 7234 9016 YAL067C 1 - 45 4
chrI 7234 9016 YAL067C 1 - 46 4
chrI 7234 9016 YAL067C 1 - 47 4
chrI 7234 9016 YAL067C 1 - 48 4
chrI 7234 9016 YAL067C 1 - 49 4
chrI 7234 9016 YAL067C 1 - 50 4 |
A proper bedGraph file has only 4 columns: chrom start end value and does not need to include positions with 0 reads, so we'll convert the bedtools coverage output to bedGraph using awk. We re-sort the output so that plus and minus strand positions are adjacent.
| Code Block |
|---|
|
cat yeast_mrna.gene_coverage.txt | awk '
BEGIN{FS=OFS="\t"}
{if ($8>0) {print $1,$2-1+$7,$2+$7,$8}}' | \
sort -k1,1 -k2,2n -k3,3n > yeast_mrna.gene_coverage.almost.bedGraph
wc -l yeast_mrna.gene_coverage.almost.bedGraph # 5,710,186 -- better, but still big |
While we probably could consider this file to have bedGraph format, it's preferable to combine adjacent per-base coordinates with the same count into larger regions, e.g.
| Code Block |
|---|
# per-base counts
chrI 7271 7272 2
chrI 7272 7273 2
chrI 7273 7274 2
chrI 7274 7275 3
chrI 7275 7276 3
chrI 7276 7277 3
chrI 7277 7278 3
# corresponding region counts
chrI 7271 7274 6
chrI 7274 7278 12 |
Here's some awk to do this:
| Code Block |
|---|
|
cat yeast_mrna.gene_coverage.almost.bedGraph | awk '
BEGIN{FS=OFS="\t"; chr=""; start=-1; end=-1; tot=0}
{if (chr != $1) { # new contig; finish previous
if (start > -1) { print chr,start,end,tot }
chr=$1; start=$2; end=$3; tot=$4
} else if ($2==pos || $2==pos+1) { # same or adjacent position
tot = tot + $4; start=$2; end=$3;
} else { # new region on same contig; finish prev
if (start > -1) { print chr,start,end,tot }
chr=$1; start=$2; end=$3; tot=$4
}
}
END{ # finish last
if (start > -1) { print chr,start,end,tot }
}' > yeast_mrna.gene_coverage.bedGraph
wc -l yeast_mrna.gene_coverage.bedGraph # |