#!/usr/bin/perl

#################################################################
#Postscript Heat Map Generator from RDP output
#################################################################
#Tarjinder Singh (ts4@williams.edu)
#Williams College '12
#################################################################
#Last Updated: Feb. 5, 2009
#################################################################

use strict;
use warnings;
use File::Spec;

#################################################################
#Initial Setup
#################################################################

#directory where raw RDP files are stored
my $file_dir = "rawRDP/";
my $index = "heat_map";
my $conf_score = 0.8;
my $header = "overview_genus";

#hash table containing DNA sequences as well as protein sequences 
my %organism_list;

#total organisms in each file, matched with @file_list
my @total_organism;

my @path = File::Spec->splitdir(File::Spec->rel2abs($0));
my $abs_path = File::Spec->catfile(@path[0..$#path - 1], "");

#################################################################
#Main
#################################################################

#Grab filenames from rawRDP
my @file_list = glob($abs_path . $file_dir . "*.rdp*");

#Read every file using read_raw sub
foreach my $i (0..$#file_list)
{
    read_raw($i);
}

#Generate Excel File
output_xls();

#Generate Postscript Output
ps_output();

#end of main
#################################################################
#Procedures
#################################################################
#################################################################
#Read_raw 
#################################################################
#Interprets data from a single .rdp file and stores into a hash 
#table
sub read_raw
{
    my $file_num = shift(@_);
    my $raw_handle;
    my $temp_line;
    my @split_name;
    
    #read file
    open($raw_handle, '<' , $file_list[$file_num])
	or die "Error in reading input file: $file_list[$file_num]: $!";

    #for every line in the file, read
    while ($temp_line = <$raw_handle>)
    {
	chomp $temp_line;
	
	#filter for unimportant lines
	if ($temp_line =~ /^>/)
	{
	    next;
	}
	
	#read important lines, checking confidence score
	while ($temp_line =~ m/(\d+\.\d+)/g)
	{
	    if ($1 < $conf_score)
	    {
		$temp_line = "";
		
	    }
	}   
	
	#format data and store into a hash table
	if ($temp_line ne "")
	{
	    $temp_line =~ s/(;|root; \d+\.\d+|\s)//gi;
	    $temp_line = join('_',split(/\d+\.\d+/, $temp_line));
	    $organism_list{$temp_line}[$file_num]++;
	    $total_organism[$file_num]++;
	}
    
    }				    
    close ($raw_handle); 
}

#################################################################
#Output_xls 
#################################################################
#outputs data into an excel file
sub output_xls
{
    my $file_name = $header . ".xls";
    my $xls_handle;
    #create excel file
    open($xls_handle, '>' , $abs_path . $file_name)
	or die "Error in reading input file: $file_name: $!";
    
    my @temp;
    print $xls_handle "\t";
    
    #output a list of filenames as column headers
    foreach my $file (@file_list)
    {
	$file =~ /\/(\w*)\./; 
	print $xls_handle $1 . "_genus\t";
    }
    
    print $xls_handle "\n";
    
    #output the data for each organism, row after row
    foreach my $organism (sort keys %organism_list)
    {
	@temp = split('_', $organism);
	print $xls_handle $temp[$#temp] . "\t";

	#output organism frequency for each file in order
	foreach my $file_num (0..$#file_list)
	{
	    if (!defined($organism_list{$organism}[$file_num]))
	    {
		$organism_list{$organism}[$file_num] = 0;
	    }
	    $organism_list{$organism}[$file_num] /= $total_organism[$file_num];
	    printf $xls_handle "%3.2f" . "\t", $organism_list{$organism}[$file_num];
	    
	}
	print $xls_handle "\n";
    }
    
    close ($xls_handle);
}

#################################################################
#PS_output 
#################################################################
#Generates heat map as a Postscript file
sub ps_output
{
    #############################################################
    ##Defining File Output Procedures
    #############################################################
    #name of PS file
    my $file_name = $index . ".ps";

    #initialize values
    my $ps_handle;
    
    open($ps_handle, '>' , $abs_path . $file_name)
	or die "Error in reading input file: $file_name: $!";

    ##########################################################
    ##PS Page Setup
    ##########################################################
    #Define PS file
    print $ps_handle "%!\n\n";

    #Conversion (Points to Inches)
    my $itop = 72; 

    #Page Definition (inches)
    my $hor_len = 8.5;
    my $ver_len = 11;
    
    #Margins of the Page (inches)
    my $LM = 0.2;     #left margin
    my $RM = 0.5;     #right margin
    my $UM = 1;       #upper margin
    my $BM = 0.5;     #lower margin

    ###########################################################
    #Boundaries (points)
    ##Page divided up into two halves
    ##1. heat map    2. name output
    ###########################################################
    my $scale = 0.55;  #heat map given 60% of space
    my $heat_length = ($hor_len - $LM - $RM) * $scale;
    my $name_length = ($hor_len - $LM - $RM) * (1-$scale);
    
    ###########################################################
    #Page Boundaries (points)
    ###########################################################
    ## 1 inch page boundary
    my $upper_limit = ($ver_len - $UM) * $itop;
    my $lower_limit = $BM * $itop;
    my $left_limit = $LM * $itop;
    my $right_limit = ($hor_len - $RM) * $itop;
    my $heat_limit = ($heat_length + $LM) * $itop;

    ###########################################################
    #Box Properties (points)
    ###########################################################
    my $b_height = 11;
    my $b_width = $heat_length * $itop / ($#file_list + 1);
    
    ##########################################################
    ##Printing Frequency Boxes
    ##########################################################
    #Define Starting Coordinates and Color Scheme
    my $x = $left_limit;
    my $y = $upper_limit - $b_height;
    my ($r, $g, $b) = (0, 0, 0);
    my $range_u = 0.5;
    my $range_l = 0.1; 
    
    #PS_box procedure, requires degree of colour (R,G,B) and location (x,y)
    print $ps_handle "/box {newpath \nmoveto \n0 $b_height rlineto \n";
    print $ps_handle "$b_width 0 rlineto \n0 -$b_height rlineto \n-$b_width 0 rlineto\n";
    print $ps_handle "closepath \n";
    print $ps_handle "gsave\n";
    print $ps_handle "setrgbcolor\n";
    print $ps_handle "fill\n"; 
    print $ps_handle "grestore} def\n";
    
    #Loop to go through each organism and file, outputting a box
    foreach my $organism (sort keys %organism_list)
    {
	foreach my $file_num (0..$#file_list)
	{
	    #USE ONLY IF EXCEL FILE NOT GENERATED FIRST
            #if (!defined($organism_list{$organism}[$file_num]))
	    #{
	    #    $organism_list{$organism}[$file_num] = 0;
	    #}
	    
	    #$organism_list{$organism}[$file_num] 
	    #    = $organism_list{$organism}[$file_num]/$total_organism[$file_num];
	    
	    #Define Color Scheme for that Frequency
	    if ($organism_list{$organism}[$file_num] > $range_l)
	    {
		$r = 0.5 * ($organism_list{$organism}[$file_num] - $range_l) / 
		    ($range_u - $range_l);
		$b = 0.5 - $r;
		$g = 0.25 - 0.25 *  ($organism_list{$organism}[$file_num] - $range_l) / 
		    ($range_u - $range_l); 
	    }
	    elsif ($organism_list{$organism}[$file_num] > 0)
	    {
		$r = 1 - $organism_list{$organism}[$file_num] / $range_l;
		$b = 0.5 + 0.5 * $r;
		$g = 0.25 + 0.75 * $r; 
	    }
	    else
	    {
		$r = 0.9;
		$b = 0.9;
		$g = 0.9;
	    }
	    
            #Print the Box
	    print $ps_handle "$r $g $b $x $y box\n";
	    #Shift Over
	    $x += $b_width;
	}
	#Changing the coordinates
	$x = $left_limit;
	$y -= $b_height;
    }
    #Variables for later use
    my @temp; 
    my $final_y = $y;
    ####################################################################
    #Set Font
    ####################################################################
    my $font = 9;
    print $ps_handle "/Arial findfont $font  scalefont setfont\n";
    
    #Defining coordinates for text output
    $x = $heat_limit + $b_height/3;
    $y = $upper_limit - $b_height + ($b_height - $font)/2;
    my $length = 0;
    my $temp_key;
    
    #outputting the genus of each organism
    foreach my $organism (sort keys %organism_list)
    {
	#modify name using Regex
	@temp = split('_', $organism);
	if ($temp[$#temp] =~ /.[A-Z]/)
	{
	    $temp[$#temp] =~ s/([A-Z][a-z]*)//;
	    $temp[0] = $1 . " ";
	    $temp[$#temp] =~ s/([A-Z])[a-z]*/$1\./g;
	    $temp[$#temp] = $temp[0] . $temp[$#temp];
	}
	
	#Ensuring there is enough room to output
	if (length($temp[$#temp]) > $length)
	{
	    $length = length($temp[$#temp]);
	    $temp_key = $temp[$#temp];
	}
	#Output name and shift coordinates
	print $ps_handle "$x $y moveto ($temp[$#temp]) show\n";
	$y-= $b_height;
    }   
    
    ####################################################################
    #Shift Over
    ####################################################################
    #Preparing for second row output
    @temp = split('_', $temp_key);
    print $ps_handle "/x ($temp[$#temp]) stringwidth pop"
	             . " $x add 2 add def\n";
    
    ####################################################################
    #Print Line (requires final coordinates, then initial)
    ####################################################################
    print $ps_handle "/line {newpath moveto lineto stroke} def\n";
    
    ####################################################################
    #Second Identifier
    ###################################################################
    $length = 0;
    my $pos = 0;
    my $count = 0;
    my $common_name = ""; 
    
    $organism_list{zz_zz_zz_zz_zz_zz} = 0;
    #output only when the organisms do not share the second classification
    foreach my $organism (sort keys %organism_list)
    {
	@temp = split('_', $organism);
	
	#store the length of the longest word
	if (length($temp[$#temp-1]) > $length)
	{
	    $length = length($temp[$#temp-1]);
	    $temp_key = $temp[$#temp-1];
	}

	#if count is 0 (first one in the list)
	if ($count == 0)
	{
	    $count++;
	    $common_name = $temp[$#temp-1];
	    $pos++;
	    next;
	}
	
	#if names are the same
	if ($common_name eq $temp[$#temp-1])
	{
	    $count++;
	    $pos++;
	}
	else
	{
	    #outputs second column of words and draws a line
	    if ($count > 1)
	    {
		$x = $upper_limit - $b_height * ($pos - $count + 0.25);
		$y = $upper_limit - $b_height * ($pos - 0.25);
		print $ps_handle "x $y x $x line\n";
		$y = ($y + $x)/2 - $font/2;
		print $ps_handle "x 2 add $y moveto($common_name) show\n";
	    }
	    else
	    {
		$y = $upper_limit - $b_height * $pos + ($b_height-$font)/2;
		print $ps_handle "x $y moveto ($common_name) show\n";
            }
            
	    $count = 1;
	    $common_name = $temp[$#temp-1];
	    $pos++;
	}
    }   
    
    ####################################################################
    #Shift Over
    ####################################################################
    @temp = split('_', $temp_key);
    print $ps_handle "/x ($temp[$#temp-1]) stringwidth pop"
	             . " x add 2 add def\n";
    ####################################################################
    #Third Identifier
    ###################################################################
    #refer to Second Identifier for comments (similar algorithm)
    $pos = 0;
    $count = 0;
    $common_name = ""; 
    foreach my $organism (sort keys %organism_list)
    {
	@temp = split('_', $organism);
	
	if ($count == 0)
	{
	    $count++;
	    $common_name = $temp[$#temp-4];
	    $pos++;
	    next;
	}
	
	if ($common_name eq $temp[$#temp-4])
	{
	    $count++;
	    $pos++;
	}
	else
	{
	    if ($count > 1)
	    {
		$x = $upper_limit - $b_height * ($pos - $count + 0.25);
		$y = $upper_limit - $b_height * ($pos - 0.25);
		print $ps_handle "x $y x $x line\n";
		$y = ($y + $x)/2 - $font/2;
		print $ps_handle "x 2 add $y moveto($common_name) show\n";
	    }
	    else
	    {
		$y = $upper_limit - $b_height * $pos + ($b_height-$font)/2;
		print $ps_handle "x $y moveto ($common_name) show\n";
            }
            
	    $count = 1;
	    $common_name = $temp[$#temp-4];
	    $pos++;
	}
    }   
    
    delete($organism_list{zz_zz_zz_zz_zz_zz});
    
    #################################################################
    #Gradient Generation
    #################################################################
    #height of the gradient
    $b_height = 10; 
    
    #Initial Starting Coordinates, $heat_limit describes the end limit 
    #of the x-axis
    $x = $left_limit; 
    $y = ($ver_len - 0.2) * $itop;
    
    #Defining the box procedure
    print $ps_handle "/box {newpath \nmoveto \n0 $b_height rlineto \n";
    print $ps_handle "$b_height 0 rlineto \n0 -$b_height rlineto \n-$b_height 0 rlineto\n";
    print $ps_handle "closepath \n";
    print $ps_handle "gsave\n";
    print $ps_handle "setrgbcolor\n";
    print $ps_handle "fill\n"; 
    print $ps_handle "grestore} def\n";
    
    #Draw the 0 box
    print $ps_handle "0.9 0.9 0.9 $x $y $b_height sub box\n";
    
    ##################################################################
    #Starting gradient generation
    ##################################################################
    #$initial -> initial point
    #$final_l -> lower ending point
    #$heat_limit -> upper ending point
    #$range_l -> percentage for lower portion
    #$range_u -> percentage for higher portion
    #$res -> thickness of the line

    my $initial = $left_limit + $b_height;
    my $final_l = $initial + $range_l/$range_u * ($heat_limit-$initial);
    my $res = 0.5;
    
    $x = $res + $initial;
    print $ps_handle "1 setlinewidth\n";
    
    #outputs a line with the appropriate color to generate gradient
    while ($x <= $final_l)
    {
	$r =  1 - ($x - $initial)/($final_l - $initial);
	$b = 0.5 + 0.5 * $r;
	$g = 0.25 + 0.75 * $r;
	print $ps_handle 
	    "$r $g $b setrgbcolor $x $y $b_height sub $x $y line\n";
        $x += $res; 
    }
    
    while ($x <= $heat_limit)
    {
	$r =  0.5 * ($x - $final_l)/($heat_limit - $final_l);
	$b = 0.5 - $r;
	$g = 0.25 - 0.25 * ($x - $final_l)/($heat_limit - $final_l);
	print $ps_handle 
	    "$r $g $b setrgbcolor $x $y $b_height sub $x $y line\n";
        $x += $res; 
    }
    
    #Generating the labels for the gradient, setting font and coordinates
    $font = 8;
    $x = $left_limit + $b_height/2;
    $y = ($ver_len - 0.2) * $itop - $b_height - $font - 2;
    print $ps_handle "0 0 0 setrgbcolor\n";
    print $ps_handle "/Arial findfont $font scalefont setfont\n";    
    print $ps_handle "/x (0) stringwidth pop $x exch 2 div sub def\n";
    print $ps_handle "x $y moveto\n";
    print $ps_handle "(0) show\n";
    my $temp;
    
    #Outputing a intervals of 5, can be changed
    my $loop = int($range_u/0.05);
    
    foreach my $i (1..$loop)
    {
	$temp = $i * 5;
	$x = $initial + ($heat_limit - $initial)/$loop * $i;
	print $ps_handle "/x ($temp) stringwidth pop $x exch 2 div sub def\n";
	print $ps_handle "x $y moveto\n";
	print $ps_handle "($temp) show\n";
    }
    #######################################################################
    ##Sequence number per File
    #######################################################################
    #Set up font, type, color
    $font = $b_width - 3;
    print $ps_handle "/Arial findfont $font scalefont setfont\n";
    print $ps_handle "0 0 0 setrgbcolor\n";
    
    #Set up coordinates
    $x = $left_limit - $b_width/2 + $font/2;
    $y = $final_y + $b_height;
    
    #Use for centering
    print $ps_handle "/y (# seqs) stringwidth pop $y exch sub def\n";
    print $ps_handle "gsave\n";
    print $ps_handle "$x y moveto\n";
    print $ps_handle "90 rotate\n";
    print $ps_handle "(# seqs) show\n";
    print $ps_handle "grestore\n";
    
    #For each file, output number of sequences
    foreach my $i (@total_organism)
    {
	 $x += $b_width;
	 print $ps_handle "/y ($i) stringwidth pop $y exch sub def\n";
	 print $ps_handle "gsave\n";
	 print $ps_handle "$x y moveto\n";
         print $ps_handle "90 rotate\n";
         print $ps_handle "($i) show\n";
         print $ps_handle "grestore\n";
    }
    close ($ps_handle);
}

############################################################################
############################################################################
