#!/bin/bash

#Annotate isoseq3 -> cdhit0.99 assembly
#Blastx transcriptome against entire nr database, but takes ages so split assembly into chunks and run blastx on each chunk in parallel to save time
#Assembly has 85,481 transcripts = 100 chunks of ~855 transcripts
#Use bioawk to split:
# 	split_fasta.awk script from: https://gist.github.com/iracooke/a8d4d2d1fbc85bd75d5f5477abff211a#file-split_fasta-awk

#Define environment variables
module load bioawk

#Move into the output directory; stop here if it is unavailable so that the
#split files are never written into whatever directory the script was launched from
cd /flash/RavasiU/Jodi/blastx_isoseq3_cdhit0.99_nr || {
  echo "ERROR: cannot cd to /flash/RavasiU/Jodi/blastx_isoseq3_cdhit0.99_nr" >&2
  exit 1
}
assembly=/bucket/RavasiU/Jodi/ISOseq/04_isoseq3_clustering_ORFs/cdhit_isoseq3/m64150_201031_012853.polished.hq_redundancy_removed_0.99.fa

#Feed the assembly to split_fasta.awk on stdin (parsed by bioawk in fastx mode).
#  prefix = passed to the awk script; presumably the filename prefix of each chunk (confirm in split_fasta.awk)
#  nrec   = passed to the awk script; 855 matches the ~855 transcripts per chunk planned above
#Redirecting the file replaces the extra cat process; if the assembly cannot be
#opened, bash reports the error and bioawk is not started.
bioawk -c fastx \
  -v prefix="isoseq3_hq_cdhit_0.99_split_" \
  -v nrec=855 \
  -f /home/j/jodi-thomas/scripts/split_fasta.awk \
  < "${assembly}"

#only takes a few secs to run
