#!/usr/bin/perl -w

use FindBin;
use lib "$FindBin::Bin";
use Alignments;
use Exit;

$nbins  = 5;    # The number of intervals of the discrete prob. distribution.

my $save_cmd_line = join(' ',@ARGV);
$verbose = 1;

while (my $arg=shift(@ARGV))
{
    if ( $arg eq '-d' || $arg eq '--descriptors' ) { $descriptors=shift(@ARGV); next; }
    if ( $arg eq '-t' || $arg eq '--title' ) { $title=shift(@ARGV); next; }
    if ( $arg eq '-a' || $arg eq '--alignment' ) { $alnfile=shift(@ARGV); next; }
    if ( $arg eq '-b' || $arg eq '--bins' ) { $nbins=shift(@ARGV); next; }
    if ( $arg eq '-v' || $arg eq '--verbose' ) { $verbose=1; next; }
    if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { usage(); }
    usage();
}

if ( !$title && $alnfile ) { $title = "Data from file $alnfile" }
elsif ( !$title && !$alnfile ) { $title = 'Data from STDIN' }

if ( !$descriptors ) { usage("Missing the -d option.\n") }

my ($vectors,$nprops,$desc_title,$desc_desc) = read_vectors($descriptors);
my $histogram = create_histogram($alnfile);
my $ranges = create_ranges($vectors,$nprops,$nbins);

if ( !($descriptors=~m{([^/]+)$}) ) { usage("Could not determine basename of \"$descriptors\".\n") }
$desc_file = $1;


print << "EOT";
#
# This file was generated by 'create-vectorlib $save_cmd_line'.
#
package Vectorlib;

\$nprops = $nprops;
\$nbins  = $nbins;
\$title  = '$title';    # Edit manually
\$desc   = '';          # Edit manually

\$descriptors_title = '$desc_title';    # For WebPCPMer create-vectorlibs
\$descriptors_desc  = '$desc_desc';
\$descriptors_file  = '$desc_file'; 

EOT

print "# The property values 1..$nprops for all amino acids.\n";
print "\$vectors = {\n";
for my $property (1..$nprops)
{
    print "\t$property => {\n";
    for my $aa (sort keys %{$$vectors{$property}})
    {
        print "\t\t'$aa' => $vectors->{$property}->{$aa}, \n";
    }
    print "\t},\n";
}
print "};\n\n";


print "# The probability distribution intervals for the $nprops properties.\n";
print "\$intervals = {\n";
for my $property (1..$nprops)
{
    print "\t$property => [";
    print join(', ', @{$ranges->{$property}});
    print "\t],\n";
}
print "};\n\n";


print "# Background frequencies for the $nbins bins.\n";
print "\$natfreqs = {\n";
for my $property (1..$nprops)
{
    my $bin_histogram = create_bin_histogram($ranges->{$property}, $vectors->{$property}, $histogram);
    print "\t$property => [";
    print join(', ', @$bin_histogram);
    print "\t],\n";
}
print "};\n\n";



print "# What bins do the amino acids fall into for each of the properties 1..$nprops.\n";
print "\$bins = {\n";
for my $property (1..$nprops)
{
    my $bin_dist = create_bin_distribution($ranges->{$property}, $vectors->{$property});

    print "\t$property => {";
    for (my $i=0; $i<scalar @$bin_dist; $i++)
    {
        my $bin = $$bin_dist[$i];
        print " ",$i+1,"=>[", join(",",@$bin), "], ";
    }
    print "},\n";
}
print "};\n\n";


print "# How many amino acids were used to create this file and frequencies of the amino acids [%].\n";
$count = 0; 
for my $c (values %$histogram) 
{ 
    $count+=$c; 
}
print "\$created_from = $count;\n\$histogram = {\n\t";
for my $aa (sort keys %$histogram)
{
    if ( $aa eq '-' ) { next }
    my $c = exists($histogram->{$aa}) ? 100*$histogram->{$aa}/$count : 0;
    print " '$aa'=>$c,";
}
print "\n};\n\n";


#   print "# The average property vectors for this amino acid distribution.\n";
#   print "\$avg_props = {\n";
#   for my $property (1..$nprops)
#   {
#       my $value = 0;
#       for my $aa (keys %$histogram)
#       {
#           $value += $$vectors{$property}{$aa} * $$histogram{$aa};
#       }
#       $value /= $count;
#       print "\t$property => $value,\n";
#   }
#   print "\n};\n";

print "\n\n1;\n";


exit;

#---------------------

sub usage
{
    my (@msg) = @_;
    if ( scalar @msg )
    {
        print STDERR join('',@msg);
        exit -1;
    }

    print STDERR 
        "About: This script reads a list of amino acid residues and prints a perl\n",
        "   code formatted as vectorlib.pm. This is useful for determining background\n",
        "   amino acids' probability distributions, which may differ for different species.\n",
        "   The format of the input file can be either a Fasta format, or simply a list.\n",
        "   of one-letter abbreviations.\n",
        "   Note: The SwissProt database can be parsed by the parse-swissprot tool.\n",
        "Usage: create-vectorlib [OPTIONS] < aa-list.txt > new-vectorlib.pm\n",
        "Options:\n",
        "   -a, --alignment <file>          Read the amino acids from the supplied alignment.\n",
        "   -b, --bins <int>                Number of bins of the discrete probability distribution. (Default is $nbins.)\n",
        "   -d, --descriptors <file>        Quantitative descriptors.\n",
        "   -t, --title <string>            Short description of the data origin.\n",
        "   -v, --verbose                   Print warning messages.\n",
        "\n";
    exit -1;
}



sub create_histogram
{
    my ($aln_file) = @_;
    my $aas = '';
    if ( $aln_file )
    {
        my $aln = Alignments::parse_alignment($aln_file);
        for my $seq (@$aln)
        {
            $aas .= $seq->{seq};
        }
    }
    else
    {
        while (my $line=<STDIN>)
        {
            if ( $line =~ /^>/ ) { next; }
            chomp($line);
            $aas .= $line;
        }
    }
    
    my $histogram = {};
    my $ignored   = {};
    my $len = length($aas);
    for (my $i=0; $i<$len; $i++)
    {
        my $aa = substr($aas,$i,1);
        if ( $aa eq '-' || ! exists($vectors->{1}->{$aa}) )
        {
            $ignored->{$aa}++;
            next;
        }

        $histogram->{$aa}++;
    }
    if ( scalar keys %$ignored > 0 )
    {
        print STDERR "Ignoring the following letters:\n" unless !$verbose;
    }
    for my $letter (sort keys %$ignored)
    {
        print STDERR "\t $letter .. $ignored->{$letter}\n" unless !$verbose;
    }
    return $histogram;
}


# This is a replacement of the old static 5 bins - create
#   them on the fly. 
# The returned hash contains $nprops arrays, each of them
#   containing $nbins+1 numbers. These numbers specify
#   the intervals for the $nbins bins.
#
sub create_ranges
{
    my ($vectors,$nprops,$nbins) = @_;
    my $ranges = {};
    for my $property (1..$nprops)
    {
        my ($min,$max);
        for my $value (values %{$vectors->{$property}})
        {
            if ( !$min || $min>$value ) { $min=$value }
            if ( !$max || $max<$value ) { $max=$value }
        }
        my $binlen = ($max-$min) / $nbins;
        for (my $ibin=0; $ibin<=$nbins; $ibin++)
        {
            my $range = $ibin*$binlen + $min;
            if ( $ibin==0 ) { $range -= $binlen*0.1 }       # Extend the limits a little to include the amino acids
            if ( $ibin==$nbins ) { $range += $binlen*0.1 }  #   at the ends.
            push @{$ranges->{$property}}, $range;
        }
    }

    # These ranges where used originally. Uncomment to reproduce original values.
    #
    #   $ranges = {
    #       1 => [-16,-10,-4,2,8,14],
    #       2 => [-11,-5.6,-0.2,5.2,10.6,16],
    #       3 => [-12,-7.6,-3.2,1.2,5.6,10],
    #       4 => [-6,-2.8,0.4,3.6,6.8,10],
    #       5 => [-6.0,-3.6,-1.2,1.2,3.6,6]
    #   };

    return $ranges;
}


# The subroutine takes an histogram (e.g. {'A'=>0,'R'=>210,...}), intervals
#   and vectors (e.g. {'A'=>2.828,'R'=>-5.687, ... }) and creates a cummulative
#   histogram where aminoacids are grouped together according to their property
#   values as defined in $vector.
# The values are returned in an array as fractions (i.e. values from interval 0-1).
#
sub create_bin_histogram
{
    my ($intervals, $vector, $histogram) = @_;
    my $tmp_hist = {};

    my $total_count = 0;
    while (my ($aa,$count) = each %$histogram)
    {
        my $ibin = get_aminoacids_bin($aa,$intervals,$vector);
        $tmp_hist->{$ibin} += $count;
        $total_count += $count;
    }
    my @ret = ();
    for (my $i=0; $i<scalar @$intervals-1; $i++)
    {
        if ( !exists($tmp_hist->{$i}) ) { $ret[$i] = 0; next; }
        $ret[$i] = $tmp_hist->{$i} / $total_count;
    }
    return \@ret;
}


# The subroutine returns a number in the range 0..$nbins-1.
# The input parameters are: 
#   $aa is amino acid (e.g. 'R')
#   $intervals is an array of $nbins+1 numbers
#   $vector is one of the property vectors (e.g. {'A'=>2.828,'R'=>-5.687, ... })
#
sub get_aminoacids_bin
{
    my ($aa,$intervals,$vector) = @_;
    if ( ! exists($vector->{$aa}) ) { Exit::exit_nicely("The amino acid '$aa' is not contained in the property vector.\n") } 

    my $value = $vector->{$aa};
    for (my $i=1; $i<scalar @$intervals; $i++)
    {
        if ( $value > $$intervals[$i-1] && $value <= $$intervals[$i] ) { return $i-1 }
    }
    Exit::exit_nicely("FIXME: The amino acid '$aa' seems to be out of range.\n");
}



sub create_bin_distribution
{
    my ($intervals, $vector) = @_;

    my $bins = {};
    for my $aa (sort keys %$vector)
    {
        if ( $aa eq '-' ) { next }
        my $bin = get_aminoacids_bin($aa,$intervals,$vector);
        push @{$bins->{$bin}}, $aa;
    }

    my @ret = ();
    for (my $i=0; $i<scalar @$intervals-1; $i++)
    {
        if ( ! exists($bins->{$i}) ) { push @ret,[]; next }
        push @ret,$bins->{$i};
    }
    return \@ret;
}


# See descriptors-VB2001-5.txt for an example what this subroutine
#   parses.
sub read_vectors
{
    my ($file) = @_;
    
    open(FILE,"<$file") || usage("$file: $!\n");
    
    my $iline;
    my $linebuf;
    my $vectors = {};
    my ($desc_title, $desc_desc, $nprops);
    my $vector;
    while (my $line=<FILE>)
    {
        $iline++;
        $line =~ s/\r?\n//g;

        $line =~ s/\#.*$//;
        if ( $line =~ /^\s*$/ ) { next }

        if ( $line=~/\\\s*$/ )  # A line escaped by \
        {
            $linebuf .= $`;
            next;
        }
        if ( $linebuf )
        {
            $line = $linebuf . $line;
            $linebuf = '';
        }

        # Now the multilines have been converted to single lines
        if ( $line=~/^\s*nprops\s*=\s*(\d+)/ )
        {
            $nprops = $1;
            next;
        }
        if ( $line=~/^\s*title\s*=\s*(.*)/ )
        {
            $desc_title = $1;
            $desc_title =~ s/\s+/ /g;
            $desc_title =~ s/\s*$//;
            next;
        }
        if ( $line=~/^\s*description\s*=\s*(.*)/ )
        {
            $desc_desc = $1;
            $desc_desc =~ s/\s+/ /g;
            $desc_desc =~ s/\s*$//;
            next;
        }
        if ( $line=~/^(\d+):\s*$/ )
        {
            $vector = $1;
            next;
        }
        if ( $line=~/^\s*(\S)\s*=\s*(\S+)/ )
        {
            if ( !defined($vector) ) 
            { 
                usage("Could not parse line $iline: Seems to be part of a vector definition, but no vector value found.\n$line\n");
            }
            if ( !$vector )
            {
                usage("The vector numbering must go from 1.\n");
            }
            my $aa    = $1;
            my $value = $2;
            $$vectors{$vector}->{$aa} = $value;
            next;
        }
    }
    close(FILE);

    return ($vectors,$nprops,$desc_title,$desc_desc);
}

