#!/usr/bin/perl use strict; use warnings; ##################################################################### # BIOL/CMPU-353 # Fall 2008 # Project 5 # # Assigned: Mon, Nov. 10 # Due: Mon, Nov. 17 # # Written by: Marc Smith and Jodi Schwarz # Modified by: # # Additional attributions: Rex Dwyer (code related to %codonMap) # # Description: # This program runs computational experiments to measure how # mutating a DNA sequence in different ways results in different # similarity measures between original and mutated sequences. # # Both mutation strategies require mutating 15% of the original # DNA sequence, with the following differences: # - one strategy mutates nucleotides occurring anywhere in the # sequence # - the other strategy targets only nucleotides in the third # position of a codon # # Question: # Which strategy will more greatly impact the similarity between # original and mutated protein sequences? ##################################################################### # Global variables my %codonMap; # hash table: codon->AA my $TRIALS = 100; # no. of experimental trials my $mutationRate = .15; # mutation rate my $nucFile = "hsp70_nuc.fasta.txt"; # files containing my $aaFile = "hsp70_aa.fasta.txt"; # DNA and protein sequences my $origDNASeq; # strings to hold original my $origProtSeq; # DNA and protein sequences # Initialize DNA and protein sequences from data files $origDNASeq = readInDNA( $nucFile ); $origProtSeq = readInDNA( $aaFile ); createCodonMap(); # table to translate DNA->protein strings # Conduct $TRIALS experiments using strategy 1 print "Results for random mutation strategy\n"; print "Trial, \%identity\n"; foreach my $trial (1..$TRIALS) { my $percentId = strategy1( ); print "$trial, $percentId\n"; } print "\n"; # Conduct $TRIALS experiments using strategy 2 #print "Results for targeted mutation strategy\n"; #print "Trial, \%identity\n"; #foreach my $trial (1..$TRIALS) #{ # my $percentId = strategy2( ); # print "$trial, $percentId\n"; #} #==================================================================== #----------\ # strategy1 \ #-------------------------------------------------------------------- # SUMMARY: Subroutine to perform a single experiment using strategy # 1. Mutates 15% of nucleotides in any random position. # sub strategy1 { # split on the empty pattern; returns individual letters my @nucleotides = split ( //, $origDNASeq ); # build new sequence my $newDNASeq = ""; foreach my $nuc (@nucleotides) { if (rand(1) < $mutationRate) { # randomly mutate $nuc to a new nucleotide (A, C, G, T) $newDNASeq = $newDNASeq . randNuc(); } else { # append original nucleotide to new sequence $newDNASeq = $newDNASeq . $nuc; } } # translate new DNA sequence to protein sequence my $newProtSeq = translate( $newDNASeq ); # calculate the percent identity my $identity = calcIdentity( $newProtSeq, $origProtSeq ); return $identity; } #--------\ # randNuc \ #-------------------------------------------------------------------- # Summary: Returns a random nucleotide from A, C, G, or T. # sub randNuc { my $n = int( rand(4) ); # random number between 0 and 3 my $nuc; # to hold return value (A, C, G, or T) if ($n == 0) { $nuc = "A"; } elsif ($n == 1) { $nuc = "C"; } elsif ($n == 2) { $nuc = "G"; } else { $nuc = "T"; } return $nuc; } #---------------\ # createCodonMap \ #-------------------------------------------------------------------- # Author: Rex Dwyer # Summary: # construct hash that maps codons to amino acids by reading table # from DATA at the end of the program, which have the following form: # Residue Codon1 Condon2 ... # sub createCodonMap { while (my $in=) { chomp($in); my @codons = split " ",$in; my $residue = shift @codons; foreach my $nnn(@codons) { $codonMap{$nnn} = $residue; } } } #----------\ # translate \ #-------------------------------------------------------------------- # Author: Rex Dwyer (modified by Marc Smith) # Summary: translates DNA strings to proteins # sub translate { my ($dna) = @_; my $pro = ""; while ($dna =~ s/(...)//) { $pro = $pro . $codonMap{$1}; } return $pro; } #-------------\ # calcIdentity \ #-------------------------------------------------------------------- # Summary: calculates the percent identity of the two given # sequences. # sub calcIdentity { # you fill in the code for this subroutine return 0.0; # it shouldn't unconditionally return 0.0... :-) } #----------\ # readInDNA \ #------------------------------------------------------------------------------ # SUMMARY: Subroutine to open a FASTA formatted file of DNA sequence # and return as one long string. The function will remove any whitespace, # ignore lines that begin with a #, and any digits that may appear in # sequence, e.g., line numbers or base pair counts. # # IN: 1 argument: name of file holding the DNA # (assumed FASTA format that includes a >header line) # # RETURNS: DNA as one string in uppercase nucleotide letters # sub readInDNA { my ($filename) = @_; # save filename argument in local variable my $firstline; # holds headerline (ignored) my $line; # holds each subsequent line one at a time my $FNAFILE; # create a file handle to the opened file open ( $FNAFILE, '<', $filename ) or die "Cannot open the input file: $filename: $!"; # read and ignore the header line if found (complain if not) if ( !($firstline = <$FNAFILE>) ) { print "Can NOT read the header line from the file called: $filename \n"; exit(); } my $seq = ""; # continually concatenate each line to end of $seq # WHILE not EOF, grab next line and concatenate sequences together while ( $line = <$FNAFILE> ) { chomp $line; # gobble the newline character # discard a blank line if ( $line =~ m/^\s*$/ ) # if whitespace (\s) from start(^) to end($), { next; } # return to while for next line # discard comment line (any line starting with a comment(#) symbol) if ($line =~ m/^\s*#/) { next; } $line =~ tr/0123456789//d; # remove all digits $line =~ tr/ \t\n\r//d; # remove any extra whitespace $line = uc($line); # convert everything to upper-case $seq = $seq . $line; # concatenate new line onto the entire text so far } # end WHILE not EOF close( $FNAFILE ); return $seq; } # end subroutine readInDNA #------------------------------------------------------------------------------ ## The lines below are not Perl statements and are not executed as ## part of the program. Instead, they are available to be read as ## input by the program using the filehandle DATA. __END__ A GCT GCC GCA GCG R CGT CGC CGA CGG AGA AGG N AAT AAC D GAT GAC C TGT TGC Q CAA CAG E GAA GAG G GGT GGC GGA GGG H CAT CAC I ATT ATC ATA L TTA TTG CTT CTC CTA CTG K AAA AAG M ATG F TTT TTC P CCT CCC CCA CCG S TCT TCC TCA TCG AGT AGC T ACT ACC ACA ACG W TGG Y TAT TAC V GTT GTC GTA GTG . TAA TAG TGA