#!/usr/bin/perl
use strict;
use warnings;

#####################################################################
# BIOL/CMPU-353
# Fall 2008
# Project 5
# 
# Assigned: Mon, Nov. 10
# Due:      Mon, Nov. 17
# 
# Written by: Marc Smith and Jodi Schwarz
# Modified by: Example Student
#
# Additional attributions: Rex Dwyer (code related to %codonMap)
#
# Description:
#   This program runs computational experiments to measure how
#   mutating a DNA sequence in different ways results in different
#   similarity measures between original and mutated sequences.
#   
#   Both mutation strategies require mutating 15% of the original
#   DNA sequence, with the following differences:
#   - one strategy mutates nucleotides occurring anywhere in the 
#     sequence
#   - the other strategy targets only nucleotides in the third 
#     position of a codon
# 
# Question:
#   Which strategy will more greatly impact the similarity between 
#   original and mutated protein sequences?
#####################################################################

# Global variables

my %codonMap;                         # hash table: codon->AA

my $TRIALS = 100;                     # no. of experimental trials
my $mutationRate = .15;               # mutation rate

my $nucFile = "hsp70_nuc.fasta.txt";  # files containing
my $aaFile = "hsp70_aa.fasta.txt";    # DNA and protein sequences

my $origDNASeq;                       # strings to hold original
my $origProtSeq;                      # DNA and protein sequences

# Initialize DNA and protein sequences from data files
$origDNASeq = readInDNA( $nucFile );
$origProtSeq = readInDNA( $aaFile );

createCodonMap();  # table to translate DNA->protein strings

my $totalPercentId1 = 0; # keeps running tab of percents for average

# Conduct $TRIALS experiments using strategy 1
print "Results for random mutation strategy\n";
print "Trial, \%identity\n";

foreach my $trial (1..$TRIALS)
{
  my $percentId = strategy1( );
  $totalPercentId1 += $percentId;
  print "$trial, $percentId\n";
}

my $totalPercentId2 = 0; # keeps running tab of percents for average

# Conduct $TRIALS experiments using strategy 2
print "\nResults for targeted mutation strategy\n";
print "Trial, \%identity\n";
foreach my $trial (1..$TRIALS)
{
  my $percentId = strategy2( );
  $totalPercentId2 += $percentId;
  print "$trial, $percentId\n";
}

print "\nAverage percent identity for Strategy One: ".( $totalPercentId1/$TRIALS );
print "\nAverage percent identity for Strategy Two: ".( $totalPercentId2/$TRIALS )."\n\n";



#====================================================================

#----------\
# strategy1 \
#--------------------------------------------------------------------
# SUMMARY: Subroutine to perform a single experiment using strategy 
# 1. Mutates 15% of nucleotides in any random position.
# 
sub strategy1
{
  # split on the empty pattern; returns individual letters
  my @nucleotides = split ( //, $origDNASeq ); 
  
  # build new sequence
  my $newDNASeq = "";

  foreach my $nuc (@nucleotides)
  {
    if (rand(1) < $mutationRate)
	{
      # randomly mutate $nuc to a new nucleotide (A, C, G, T)
      $newDNASeq = $newDNASeq . randNuc($nuc);
    }
    else 
    {
      # append original nucleotide to new sequence
      $newDNASeq = $newDNASeq . $nuc;
    }
  }
  
  # translate new DNA sequence to protein sequence
  my $newProtSeq = translate( $newDNASeq );
  
  # calculate the percent identity 
  my $identity = calcIdentity( $newProtSeq, $origProtSeq );
  
  return $identity;
}

#====================================================================

#----------\
# strategy2 \
#--------------------------------------------------------------------
# SUMMARY: Subroutine to perform a single experiment using strategy 
# 2. Targets all mutations to the third position of the codons
# 
sub strategy2
{
  # split on the empty pattern; returns individual letters
  my @nucleotides = split ( //, $origDNASeq ); 

  # build new sequence
  my $newDNASeq = "";
  my $nuc = 0; 
  
  # set mutation rate equal to .45, which = .15 * 3 because you want
  # the third nucleotide to be mutated a percent of the time so that
  # the entire sequence is still mutated 15% of the time. It is used 
  # during randNuc and is temporary.
  $mutationRate = .45;
  
  # Keeps track of position of the array during iteration
  my $arrayPos = 1;
  
  foreach my $nuc ( @nucleotides )
  {
     # Additional if condition added for strategy 2.  This says that
     # what is contained in the condition will only happen if 
     # the position of the array element happens to be divisible 
     # evenly by three, meaning that during iteration it is at the 
     # third position of the codon.  
     if ( ( rand(1) < $mutationRate ) and ( $arrayPos % 3 == 0 ) )
     {
       # randomly mutate $nuc to a new nucleotide (A, C, G, T)
       $newDNASeq = $newDNASeq . randNuc( $nuc );
     }
     else 
     {
       # append original nucleotide to new sequence
       $newDNASeq = $newDNASeq . $nuc;
     }
    
     $arrayPos++; # increment array position counter
  }
  
  # translate new DNA sequence to protein sequence
  my $newProtSeq = translate( $newDNASeq );
  
  # calculate the percent identity 
  my $identity = calcIdentity( $newProtSeq, $origProtSeq );
  
  # reset mutation rate
  $mutationRate = .15;
  
  return $identity;
}	

#--------\
# randNuc \
#--------------------------------------------------------------------
# Summary: Returns a random nucleotide from A, C, G, or T.
#
sub randNuc
{
  my $nucleotide = shift(@_); #shift off argument
  my $n = int( rand(4) );     # random number between 0 and 3
  my $nuc;                    # to hold return value (A, C, G, or T)
     
  # Condition for setting $nuc: Random number is matched to corresponding
  # value between 0 and 3.  Letters chosen arbitrarily with respect to 
  # numbers
  if ($n == 0) 
  {
    $nuc = "A";
  }
  elsif ($n == 1)
  {
    $nuc = "C";
  }
  elsif ($n == 2)
  {
    $nuc = "G";
  }
  else
  {
    $nuc = "T";
  }
  
  # Before you return a value for $nuc, check to see if it equals the
  # original letter from the string.  If it does, then you must perform
  # the operation again and again until a different random letter is 
  # assigned.  This ensures that a different nucleotide is substituted 
  # as a result of mutation
  if ( $nuc eq $nucleotide )
  {
    randNuc( $nuc );
  }
  else 
  {
    return $nuc;
  }
}


#---------------\
# createCodonMap \
#--------------------------------------------------------------------
# Author: Rex Dwyer
# Summary:
# construct hash that maps codons to amino acids by reading table
# from DATA at the end of the program, which have the following form:
# Residue Codon1 Condon2 ...
#
sub createCodonMap
{
  while (my $in=<DATA>)
  {
    chomp($in);
    my @codons = split " ",$in;
    my $residue = shift @codons;
    foreach my $nnn(@codons) 
    {
      $codonMap{$nnn} = $residue;
    }
  }
}

#----------\
# translate \
#--------------------------------------------------------------------
# Author: Rex Dwyer (modified by Marc Smith)
# Summary: translates DNA strings to proteins
#
sub translate {
	my ($dna) = @_;
	my $pro = "";
	while ($dna =~ s/(...)//) {
		$pro = $pro . $codonMap{$1};
	}
	return $pro;
}


#-------------\
# calcIdentity \
#--------------------------------------------------------------------
# Author: Philip Tully
# Summary: calculates the percent identity of the two given
# sequences.
#
sub calcIdentity
{
  my $newSequence = shift(@_);	# shift off first argument
  my $origSequence = shift(@_);	# shift off second argument
	
  my $numMatches;  # number of matches between original, new sequences
  my @newSeqArray; # declare arrays
  my @origSeqArray;
  
  # allocate each string letter into an indivdual array position
  @newSeqArray = split( //, $newSequence );
  @origSeqArray = split( //, $origSequence );
	
  # iterate through the original sequence array. This may be done because
  # it is known that the lengths of the original sequence and the new 
  # sequence are equivalent.
  foreach my $i ( 0 .. ( $#origSeqArray - 1 ) )
  {
    # condition: if the positional array value of the two 
    # strings are equal
    if ( $newSeqArray[ $i ] eq $origSeqArray[ $i ] )
    {
      $numMatches++; # increment match counter
    }
  }
  
  # the value returned is the percentage identity.  This is equal to
  # the number of matches between the two compared strings, divided by
  # the length of the strings (their lengths are equal), and multiplied 
  # by 100 to get a percentage value out of 100%
  return ( ( $numMatches / length ( $origSequence ) ) * 100 ); 
}


#----------\
# readInDNA \
#------------------------------------------------------------------------------
# SUMMARY: Subroutine to open a FASTA formatted file of DNA sequence
# and return as one long string.  The function will remove any whitespace,
# ignore lines that begin with a #, and any digits that may appear in
# sequence, e.g., line numbers or base pair counts.
#
# IN:       1 argument:  name of file holding the DNA 
#           (assumed FASTA format that includes a >header line)
#
# RETURNS:  DNA as one string in uppercase nucleotide letters
#
sub readInDNA
{
	my ($filename) = @_;	# save filename argument in local variable

	my $firstline;	# holds headerline (ignored)
	my $line;	# holds each subsequent line one at a time

	my $FNAFILE;	# create a file handle to the opened file
	
	open ( $FNAFILE, '<', $filename )
           or die "Cannot open the input file: $filename: $!";
      
    # read and ignore the header line if found (complain if not)   
    if ( !($firstline = <$FNAFILE>) ) 
    {
       print "Can NOT read the header line from the file called: $filename \n";
       exit();
    }
	
	my $seq = "";	# continually concatenate each line to end of $seq
  
  	# WHILE not EOF, grab next line and concatenate sequences together
	while ( $line = <$FNAFILE> )		
	{
			chomp $line;	 # gobble the newline character
			
	    	# discard a blank line
        	if ( $line =~ m/^\s*$/ ) # if whitespace from start to end,
        	{	next; }		 # return to while for next line
        	 
        	# discard comment line
        	if ($line =~ m/^\s*#/) 
        	{   next; } 
		
			$line =~ tr/0123456789//d; # remove all digits
			$line =~ tr/ \t\n\r//d;	   # remove extra whitespace
			$line = uc($line);	   # convert to upper-case
			
    		$seq = $seq . $line; # concatenate new line onto entire text
	
	} # end WHILE not EOF
	
	close( $FNAFILE );
	
	return $seq;
	
} # end subroutine readInDNA
#------------------------------------------------------------------------------

## The lines below are not Perl statements and are not executed as 
## part of the program. Instead, they are available to be read as 
## input by the program using the filehandle DATA.
__END__
A GCT GCC GCA GCG
R CGT CGC CGA CGG AGA AGG
N AAT AAC
D GAT GAC
C TGT TGC
Q CAA CAG
E GAA GAG
G GGT GGC GGA GGG
H CAT CAC
I ATT ATC ATA
L TTA TTG CTT CTC CTA CTG
K AAA AAG
M ATG
F TTT TTC
P CCT CCC CCA CCG
S TCT TCC TCA TCG AGT AGC
T ACT ACC ACA ACG
W TGG
Y TAT TAC
V GTT GTC GTA GTG
. TAA TAG TGA
