#!/usr/bin/perl
use strict;
use warnings;

#####################################################################
# BIOL/CMPU-353
# Fall 2008
# Project 5
# 
# Assigned: Mon, Nov. 10
# Due:      Mon, Nov. 17
# 
# Written by: Marc Smith and Jodi Schwarz
# Modified by: <your name here>
#
# Additional attributions: Rex Dwyer (code related to %codonMap)
#
# Description:
#   This program runs computational experiments to measure how
#   mutating a DNA sequence in different ways results in different
#   similarity measures between original and mutated sequences.
#   
#   Both mutation strategies require mutating 15% of the original
#   DNA sequence, with the following differences:
#   - one strategy mutates nucleotides occurring anywhere in the 
#     sequence
#   - the other strategy targets only nucleotides in the third 
#     position of a codon
# 
# Question:
#   Which strategy will more greatly impact the similarity between 
#   original and mutated protein sequences?
#####################################################################

# Global variables

my %codonMap;                         # hash table: codon->AA

my $TRIALS = 100;                     # no. of experimental trials
my $mutationRate = .15;               # mutation rate

my $nucFile = "hsp70_nuc.fasta.txt";  # files containing
my $aaFile = "hsp70_aa.fasta.txt";    # DNA and protein sequences

my $origDNASeq;                       # strings to hold original
my $origProtSeq;                      # DNA and protein sequences

# Initialize DNA and protein sequences from data files
$origDNASeq = readInDNA( $nucFile );
$origProtSeq = readInDNA( $aaFile );

createCodonMap();  # table to translate DNA->protein strings

# Conduct $TRIALS experiments using strategy 1
print "Results for random mutation strategy\n";
print "Trial, \%identity\n";
foreach my $trial (1..$TRIALS)
{
  my $percentId = strategy1( );
  print "$trial, $percentId\n";
}
print "\n";

# Conduct $TRIALS experiments using strategy 2
#print "Results for targeted mutation strategy\n";
#print "Trial, \%identity\n";
#foreach my $trial (1..$TRIALS)
#{
#  my $percentId = strategy2( );
#  print "$trial, $percentId\n";
#}


#====================================================================

#----------\
# strategy1 \
#--------------------------------------------------------------------
# SUMMARY: Subroutine to perform a single experiment using strategy 
# 1. Mutates 15% of nucleotides in any random position.
# 
sub strategy1
{
  # split on the empty pattern; returns individual letters
  my @nucleotides = split ( //, $origDNASeq ); 
  
  # build new sequence
  my $newDNASeq = "";
  foreach my $nuc (@nucleotides)
  {
    if (rand(1) < $mutationRate)
	{
      # randomly mutate $nuc to a new nucleotide (A, C, G, T)
      $newDNASeq = $newDNASeq . randNuc();
    }
    else 
    {
      # append original nucleotide to new sequence
      $newDNASeq = $newDNASeq . $nuc;
    }
  }
  
  # translate new DNA sequence to protein sequence
  my $newProtSeq = translate( $newDNASeq );
  
  # calculate the percent identity 
  my $identity = calcIdentity( $newProtSeq, $origProtSeq );
  
  return $identity;
}


#--------\
# randNuc \
#--------------------------------------------------------------------
# Summary: Returns a random nucleotide from A, C, G, or T.
#
sub randNuc
{
  my $n = int( rand(4) );  # random number between 0 and 3
  my $nuc;                 # to hold return value (A, C, G, or T)
  if ($n == 0) 
  {
    $nuc = "A";
  }
  elsif ($n == 1)
  {
    $nuc = "C";
  }
  elsif ($n == 2)
  {
    $nuc = "G";
  }
  else
  {
    $nuc = "T";
  }
  return $nuc;
}


#---------------\
# createCodonMap \
#--------------------------------------------------------------------
# Author: Rex Dwyer
# Summary:
# construct hash that maps codons to amino acids by reading table
# from DATA at the end of the program, which have the following form:
# Residue Codon1 Condon2 ...
#
sub createCodonMap
{
  while (my $in=<DATA>)
  {
    chomp($in);
    my @codons = split " ",$in;
    my $residue = shift @codons;
    foreach my $nnn(@codons) 
    {
      $codonMap{$nnn} = $residue;
    }
  }
}

#----------\
# translate \
#--------------------------------------------------------------------
# Author: Rex Dwyer (modified by Marc Smith)
# Summary: translates DNA strings to proteins
#
sub translate {
	my ($dna) = @_;
	my $pro = "";
	while ($dna =~ s/(...)//) {
		$pro = $pro . $codonMap{$1};
	}
	return $pro;
}


#-------------\
# calcIdentity \
#--------------------------------------------------------------------
# Summary: calculates the percent identity of the two given
# sequences.
#
sub calcIdentity
{
  # you fill in the code for this subroutine
  return 0.0;
  # it shouldn't unconditionally return 0.0...  :-)
}


#----------\
# readInDNA \
#------------------------------------------------------------------------------
# SUMMARY: Subroutine to open a FASTA formatted file of DNA sequence
# and return as one long string.  The function will remove any whitespace,
# ignore lines that begin with a #, and any digits that may appear in
# sequence, e.g., line numbers or base pair counts.
#
# IN:       1 argument:  name of file holding the DNA 
#           (assumed FASTA format that includes a >header line)
#
# RETURNS:  DNA as one string in uppercase nucleotide letters
#
sub readInDNA
{
	my ($filename) = @_;	# save filename argument in local variable

	my $firstline;	# holds headerline (ignored)
	my $line;		# holds each subsequent line one at a time

	my $FNAFILE;	# create a file handle to the opened file
	
	open ( $FNAFILE, '<', $filename )
           or die "Cannot open the input file: $filename: $!";
      
    # read and ignore the header line if found (complain if not)   
    if ( !($firstline = <$FNAFILE>) ) 
    {
       print "Can NOT read the header line from the file called: $filename \n";
       exit();
    }
	
	my $seq = "";	# continually concatenate each line to end of $seq
  
  	# WHILE not EOF, grab next line and concatenate sequences together
	while ( $line = <$FNAFILE> )		
	{
			chomp $line;				# gobble the newline character
			
	    	# discard a blank line
        	if ( $line =~ m/^\s*$/ ) 	# if whitespace (\s) from start(^) to end($), 
        	{	next; }					#    return to while for next line
        	 
        	# discard comment line (any line starting with a comment(#) symbol)
        	if ($line =~ m/^\s*#/) 
        	{   next; } 
		
			$line =~ tr/0123456789//d;	# remove all digits
			$line =~ tr/ \t\n\r//d;		# remove any extra whitespace
			$line = uc($line);			# convert everything to upper-case
			
    		$seq = $seq . $line;		# concatenate new line onto the entire text so far
	
	} # end WHILE not EOF
	
	close( $FNAFILE );
	
	return $seq;
	
} # end subroutine readInDNA
#------------------------------------------------------------------------------

## The lines below are not Perl statements and are not executed as 
## part of the program. Instead, they are available to be read as 
## input by the program using the filehandle DATA.
__END__
A GCT GCC GCA GCG
R CGT CGC CGA CGG AGA AGG
N AAT AAC
D GAT GAC
C TGT TGC
Q CAA CAG
E GAA GAG
G GGT GGC GGA GGG
H CAT CAC
I ATT ATC ATA
L TTA TTG CTT CTC CTA CTG
K AAA AAG
M ATG
F TTT TTC
P CCT CCC CCA CCG
S TCT TCC TCA TCG AGT AGC
T ACT ACC ACA ACG
W TGG
Y TAT TAC
V GTT GTC GTA GTG
. TAA TAG TGA
