Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Does this really work?

This fake reference has 4 RAD fragments, 3 with 100-300 bp insert sizes and 1 more with a 350 bp insert size for NlaIII and MluCI

...

Code Block
titlesolution
collapsetrue
#download the reference from NCBI
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/222/465/GCF_000222465.1_Adig_1.1/GCF_000222465.1_Adig_1.1_genomic.fna.gz
  
#decompress it
gunzip GCF_000222465.1_Adig_1.1_genomic.fna.gz
 
#check the command on some different max insert sizes
cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,400}CCGG" | wc -l
cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,500}CCGG" | wc -l
cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,600}CCGG" | wc -l
  
#This can be done in a loop:
 
 
#set up the sizes you want to check
seq 400 200 800 > maxInserts.txt
cat maxInserts.txt
 
 
#build a header for your output
echo -e "insert_range\tNsites" > prediction_results.txt
 
 
#loop through your max insert sizes to get predictions for the number of sites
while read max
 do echo "calculating for 300-${max}..."
 nSites=$(cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,${max}}CCGG" | wc -l)
 echo -e "300-${max}\t${nSites}" >> prediction_results.txt
done < maxInserts.txt
  
#look at the results
cat prediction_results.txt
  
#These values are somewhat low
#What about a four-nucleotide restriction enzyme?
#for NlaIII
cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "CATG.{300,400}CCGG" | wc -l