#!/usr/bin/perl -X
use strict;
use warnings;
use File::Path qw(make_path);
use Getopt::Long;
use Bio::DB::GenBank;
use Bio::Phylo::Util::Logger ':levels';

# purpose: this script downloads the reference genome
# https://www.ncbi.nlm.nih.gov/nuccore/1798174254
# and splits it into its different coding regions and 
# corresponding amino acid translations. each of these
# sequences is written to a separate fasta file named
# after either the gene, or in the case of ORF1ab into
# its constituent mat_peptide products.

my $verbosity = INFO;
my $accession = 'NC_045512';
my $datadir   = $ENV{'DATADIR'};
GetOptions(
	'accession=s' => \$accession,
	'datadir=s'   => \$datadir,
	'verbose+'    => \$verbosity,
);

# instantiate services and object
Bio::Phylo::Util::Logger->new(
	'-level' => $verbosity,
	'-class' => 'main',
);

INFO "instantiating GenBank client object";
my $gb = Bio::DB::GenBank->new();

INFO "fetching sequence $accession";
my $seq = $gb->get_Seq_by_acc($accession);

INFO "going to extract features";
my %ORF1ab;
FEAT: for my $feat ( $seq->get_SeqFeatures ) {

	# get the product name
	my $product; 
	if ( $feat->has_tag('product') ) {
		($product) = $feat->get_tag_values('product');
		DEBUG "product=$product";
	}
	
	# get the gene name
	my $gene;
	if ( $feat->has_tag('gene') ) {	
		($gene) = $feat->get_tag_values('gene');
		DEBUG "gene=$gene";
	}
	
	# get the note field
	my $note;
	if ( $feat->has_tag('note') ) {
		($note) = $feat->get_tag_values('note');
		DEBUG "note=$note";
	}
	
	# get coordinates and seq
	my $start = $feat->start;
	my $end   = $feat->end;	
	my $rseq  = $feat->spliced_seq->seq;
	
	# pseudo-object relative to which we compute the sequences within the polyprotein
	if ( $feat->primary_tag eq 'CDS' and $gene eq 'ORF1ab' and $note =~ /^pp1ab;/ ) {
		$ORF1ab{'seq'}   = $rseq;
		($ORF1ab{'trans'}) = $feat->get_tag_values('translation');
	}	

	# the CDS that are not part of the ORF1ab
	if ( $feat->primary_tag eq 'CDS' and $gene ne 'ORF1ab' ) {
		my ($aaseq) = $feat->get_tag_values('translation');
		
		# write the extracted nucleotide and aa sequence
		seqwriter(
			'type' => 'rna',
			'name' => $gene,
			'seq'  => $rseq,
		);
		seqwriter(
			'type' => 'aa',
			'name' => $gene,
			'seq'  => $aaseq,
		);
	}
	elsif ( $feat->primary_tag eq 'mat_peptide' and $gene eq 'ORF1ab' ) {
				
		# write the extracted nucleotide and aa sequence
		seqwriter(
			'type' => 'rna',
			'name' => $product,
			'seq'  => $rseq,
		);
		
		# the logic is as follows:
		# 1. we find the index occurrence of the spliced sequence inside ORF1ab / rna
		# 2. we compute the length of the spliced seq
		# 3. divide these by 3, which should be the locations inside ORF1ab / aa
		my $lindex = ( index $ORF1ab{'seq'}, $rseq ) / 3;
		my $length = length($rseq) / 3;		
		if ( int($lindex)!=$lindex or int($length)!=$length ) {
			ERROR "invalid coordinates in $product: $lindex, $length";
		}
		else {
			my $aaseq = substr $ORF1ab{'trans'}, $lindex, $length;
			seqwriter( 
				'type' => 'aa',
				'name' => $product,
				'seq'  => $aaseq,
			);	
		}	
	}
}

# type=(rna|aa), name=($gene|$product), seq=raw
sub seqwriter {
	my %args = @_;
	my ( $type, $name, $seq ) = @args{qw(type name seq)};
	$name =~ s/ /_/g;
	my $dir = "${datadir}/genes/${name}";
	make_path($dir) if not -d $dir;
	my $filename = "${dir}/${type}.fasta";
	
	INFO "going to write ${type} seq for ${name} to ${filename}";
	open my $fh, '>', $filename;
	print $fh ">$accession\n", $seq;
	close $fh;	
}