#!/bin/bash

#The Transdecoder .cds output contains only the coding (CDS) portion of each transcript.
#We want to retain the entire sequence of every transcript that contains an ORF/CDS.
#This code was run directly on the command line, from within the directory:
#/bucket/RavasiU/Jodi/ISOseq/04_isoseq3_clustering_ORFs/cdhit_isoseq3/isoseq3_cdhit0.99_transdecoder/isoseq3_cdhit0.99_transdecoder_blastpnrmollusca

#Get the list of transcript IDs found to contain ORFs, from the transdecoder .cds output
#
#extract_transcript_ids FILE...
#  Prints one transcript ID per line to stdout, taken from the FASTA headers of FILE...
#  - Only lines that START with ">" are treated as headers.
#  - The ID is the first whitespace-delimited word of the header with the leading ">"
#    and a trailing ".p<number>" ORF suffix removed. Anchoring the suffix at the end of
#    that word means an ID that itself contains ".p" (e.g. "gene.partial_7.p1") is no
#    longer cut short at the first ".p".
#  - Each ID is printed once, in first-seen order, even if a transcript has several ORFs.
extract_transcript_ids() {
	awk '
		/^>/ {
			id = substr($1, 2)           # drop the leading ">"
			sub(/\.p[0-9]+$/, "", id)    # drop only the trailing ORF suffix
			if (id != "" && !(id in seen)) {
				seen[id] = 1
				print id
			}
		}
	' "$@"
}

extract_transcript_ids *transdecoder.cds > transcriptID_list.txt

#Filter the assembly fasta down to the transcripts named in transcriptID_list.txt, using bioawk.
#For every record whose name is in the list, the record is written out as ">name" followed by
#its sequence on one line (only $name and $seq are printed).
module load bioawk/20210202

assembly_fa=/bucket/RavasiU/Jodi/ISOseq/04_isoseq3_clustering_ORFs/cdhit_isoseq3/m64150_201031_012853.polished.hq_redundancy_removed_0.99.fa
filtered_out=m64150_201031_012853.polished.hq_redundancy_removed_0.99.fa.transdecoder_genes.cds

bioawk -cfastx '
	BEGIN {
		# Load every line of the ID list as a key of the lookup table
		while ((getline wanted_id < "transcriptID_list.txt") > 0)
			keep[wanted_id] = 1
	}
	keep[$name] { print ">" $name "\n" $seq }
' "$assembly_fa" > "$filtered_out"

