#!/usr/bin/perl
use strict;
use warnings;

#####################################################################
# BIOL/CMPU-353
# Fall 2008
# Mon, Dec 1
#
# Program: mutation_stats.pl
#
# Written by: Marc Smith and Jodi Schwarz
# Modified by: Example Student
#
# Additional attributions: Rex Dwyer (code related to %codonMap)
#
# Description:  This program
#   1. reads in six aligned DNA sequences of HSP70 from
#      Celegans, Rat, Drosophila, Mouse, Human, and Nematostella.
#   2. calculates diversity of nucleotides in each codon position
#   3. calculates statistics for each codon position:
#      average, mode, ...
# 
# Lab: This program doesn't account for gaps in reasonable way.
#      Modify this program to improve the statistics it reports.
#
#####################################################################

# Global variables

# file containing six aligned nucleotide seqs of HSP70
#my $alignedNucSeqs = "multifasta_nuc.txt";    
my $alignedNucSeqs = "/home/joschwarz/cs353_public/proj5/multiFASTA_nuc_aln.txt";    

# strings to hold aligned sequences from file
my $seq1;                  
my $seq2;                  
my $seq3;                  
my $seq4;                  
my $seq5;                  
my $seq6;                  

my @div;  # array of diversity measures for each pos in alignment

# Initialize DNA and protein sequences from data files
readInDNASeqs( $alignedNucSeqs );

# Calculate the diversity of nucs at each position
# in 6-sequence alignment
calcPosDiv( );

# Calculate divsersity statistics
calcCodonAverage( );

#====================================================================

#-----------\
# calcPosDiv \
#--------------------------------------------------------------------
# SUMMARY: Calculates the nucleotide diversity at each position
#          of a 6-sequence alignment.
# 
# INPUTS: no parameters, but compares globals $seq1--$seq6
# OUTPUTS: no return value
# SIDE EFFECT: global @div array populated
# 
sub calcPosDiv
{
  # split on the empty pattern; returns individual letters
  my @seq1Array = split ( //, $seq1 ); 
  my @seq2Array = split ( //, $seq2 ); 
  my @seq3Array = split ( //, $seq3 ); 
  my @seq4Array = split ( //, $seq4 ); 
  my @seq5Array = split ( //, $seq5 ); 
  my @seq6Array = split ( //, $seq6 ); 
  
  my $seqLength = length($seq1);
  foreach my $i (0..$seqLength-1)
  {
    # make a string of each nuc at position i in alignment
    my $seqI = $seq1Array[$i] . $seq2Array[$i] . $seq3Array[$i] .
               $seq4Array[$i] . $seq5Array[$i] . $seq6Array[$i];

    # count number of occurrences of each nucleotide
    my $numA = ( $seqI =~ tr/A/A/ );
    my $numC = ( $seqI =~ tr/C/C/ );
    my $numT = ( $seqI =~ tr/T/T/ );
    my $numG = ( $seqI =~ tr/G/G/ );
    my $numGap = ( $seqI =~ tr/-/-/ );

    # count total number of nucleotides at position i
    my $divCnt = 0;
    if ( $numGap > 0 )
    {
	    $divCnt = 0;
    }
    else 
    {
    if ($numA > 0)
    {
      $divCnt++;
    }
    if ($numC > 0)
    {
      $divCnt++;
    }
    if ($numT > 0)
    {
      $divCnt++;
    }
    if ($numG > 0)
    {
    $divCnt++;
    }
    }
    # store diversity count in global diversity array
    $div[$i] = $divCnt;
  }
  
}


#---------------\
# calcCodonStats \
#--------------------------------------------------------------------
# Summary: calculates statistics such as the average diversity
#          by nucleotide position within codons
#
# INPUTS: no parameters, but accesses @div global array
# OUTPUTS: none
# SIDE EFFECT: produces output to the terminal
sub calcCodonAverage
{
  # counters
  my $sumPos0 = 0; # sum of diversity in codons' first position
  my $sumPos1 = 0; # sum of diversity in codons' second position
  my $sumPos2 = 0; # sum of diversity in codons' third position

  my $elems = scalar(@div);  # number of elements in @div
  foreach my $i (0..$elems-1)
  {
  if ( $div[$i] == 0 )
  {
	  $elems--;
  }
  else 
  {
      if ( ($i%3) == 0 )
      {
	 $sumPos0 += $div[$i];
      }
      elsif ( ($i%3) == 1 )
      {
	$sumPos1 += $div[$i];
      }
      else # ($i%3) == 2
      {
	$sumPos2 += $div[$i];
    }
  }
  }
  # calculate average diversity for each codon position
  # my $numCodons = $elems / 3;
  my $numCodons = $elems / 3;
  my $avgDiv0 = $sumPos0 / $numCodons;
  my $avgDiv1 = $sumPos1 / $numCodons;
  my $avgDiv2 = $sumPos2 / $numCodons;

  # print stats
  print "\nAverage diversity by nuceotide position within codons:\n";
  print "\tCodon position 1: $avgDiv0\n";
  print "\tCodon position 2: $avgDiv1\n";
  print "\tCodon position 3: $avgDiv2\n";
  print "\n";
}

#--------------\
# readInDNASeqs \
#------------------------------------------------------------------------------
# SUMMARY: Subroutine to open a multi-FASTA formatted file of DNA sequences
#          and read in the six sequences it contains.
#
# IN:       1 argument:  name of file holding the DNA 
#           (assumed FASTA format that includes 6 >header/seq line pairs)
#
# RETURNS:  nothing
# SIDE EFFECT: global variables seq1-seq6 initialized to seqs from file
#
sub readInDNASeqs
{
  my ($filename) = @_;  # save filename argument in local variable

  my $FNAFILE;  # create a file handle to the opened file

  my $headerLine;  # holds headerline of each sequence (ignored)
  
  open ( $FNAFILE, '<', $filename )
    or die "Cannot open the input file: $filename: $!";

  # read sequence 1 ignoring the header line
  $headerLine = <$FNAFILE>;
  $seq1 = <$FNAFILE>;
  chomp $seq1;
  $seq1 = uc($seq1);

  # read sequence 2 ignoring the header line
  $headerLine = <$FNAFILE>;
  $seq2 = <$FNAFILE>;
  chomp $seq2;
  $seq2 = uc($seq2);

  # read sequence 3 ignoring the header line
  $headerLine = <$FNAFILE>;
  $seq3 = <$FNAFILE>;
  chomp $seq3;
  $seq3 = uc($seq3);

  # read sequence 4 ignoring the header line
  $headerLine = <$FNAFILE>;
  $seq4 = <$FNAFILE>;
  chomp $seq4;
  $seq4 = uc($seq4);

  # read sequence 5 ignoring the header line
  $headerLine = <$FNAFILE>;
  $seq5 = <$FNAFILE>;
  chomp $seq5;
  $seq5 = uc($seq5);

  # read sequence 6 ignoring the header line
  $headerLine = <$FNAFILE>;
  $seq6 = <$FNAFILE>;
  chomp $seq6;
  $seq6 = uc($seq6);

  close( $FNAFILE );
  
} # end subroutine readInDNA
#------------------------------------------------------------------------------