Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Code Block
titlesolution
collapsetrue
#look at the file
less test_reference.fasta


#note this may not work on a Mac, try on TACC if it doesn't
cd in_silico_digestion
cat test_reference.fasta | tr -d "\n" | grep -o -E "CATG.{100,300}AATT" | wc -l
cat test_reference.fasta | tr -d "\n" | grep -o -E "CATG.{100,350}AATT" | wc -l

...

Code Block
titlesolution
collapsetrue
#download the reference from NCBI
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/222/465/GCF_000222465.1_Adig_1.1/GCF_000222465.1_Adig_1.1_genomic.fna.gz
  
#decompress it
gunzip GCF_000222465.1_Adig_1.1_genomic.fna.gz
 
#check the command on some different max insert sizes
cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,400}CCGG" | wc -l
cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,500}CCGG" | wc -l
cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,600}CCGG" | wc -l
  
#This can be done in a loop:
 
 
#set up the sizes you want to check
seq 400 100200 1000800 > maxInserts.txt
lesscat maxInserts.txt
 
 
#build a header for your output
echo -e "insert_range\tNsites" > prediction_results.txt
 
 
#loop through your max insert sizes to get predictions for the number of sites
while read max
 do nSites=$(cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "GCATGC.{300,${max}}CCGG" | wc -l)
 echo -e "300-${max}\t${nSites}" >> prediction_results.txt
done < maxInserts.txt
  
#look at the results
cat prediction_results.txt
  
#These values are somewhat low
#What about a four-nucleotide restriction enzyme?
#for NlaIII
cat GCF_000222465.1_Adig_1.1_genomic.fna | tr -d "\n" | grep -o -E "CATG.{300,400}CCGG" | wc -l