#! /usr/bin/perl ########################################################################################## # # file: $ENV{HOME}/Cleaned_Scripts/Algorithms/pbmsa # last update: 08/13/02 # # The purpose of this script is to analyze the output of Paup and # choose a model according to decision theory # # Usage: perl DT_ModSel [score_file] [tree_file] [sample size] # ########################################################################################## use strict; if ($#ARGV < 2) { print "Usage: perl DT_ModSel [score_file] [tree_file] [sample size]\n"; exit; } my $cur_score_file = shift @ARGV; my $cur_tree_file = shift @ARGV; my $sample_size = shift @ARGV; open SCORE, $cur_score_file or die "Can not open \"$cur_score_file\" ($!)"; my $cur_line = ''; my @log_score = (); my @model_list = ( 'JC', 'JC+I', 'JC+G', 'JC+I+G', 'F81', 'F81+I', 'F81+G', 'F81+I+G', 'K80', 'K80+I', 'K80+G', 'K80+I+G', 'HKY', 'HKY+I', 'HKY+G', 'HKY+I+G', 'TrNef', 'TrNef+I', 'TrNef+G', 'TrNef+I+G', 'TrN', 'TrN+I', 'TrN+G', 'TrN+I+G', 'K81', 'K81+I', 'K81+G', 'K81+I+G', 'K81uf', 'K81uf+I', 'K81uf+G', 'K81uf+I+G', 'TIMef', 'TIMef+I', 'TIMef+G', 'TIMef+I+G', 'TIM', 'TIM+I', 'TIM+G', 'TIM+I+G', 'TVMef', 'TVMef+I', 'TVMef+G', 'TVMef+I+G', 'TVM', 'TVM+I', 'TVM+G', 'TVM+I+G', 'SYM', 'SYM+I', 'SYM+G', 'SYM+I+G', 'GTR', 'GTR+I', 'GTR+G', 'GTR+I+G' ); my @num_of_parameters = (0, 1, 1, 2, 3, 4, 4, 5, 1, 2, 2, 3, 4, 5, 5, 6, 2, 3, 3, 4, 5, 6, 6, 7, 2, 3, 3, 4, 5, 6, 6, 7, 3, 4, 4, 5, 6, 7, 7, 8, 4, 5, 5, 6, 7, 8, 8, 9, 5, 6, 6, 7, 8, 9, 9, 10); ################################# get log scores ############################# foreach (1..56) { $cur_line = ; $cur_line = ; if ($cur_line =~ /1\t([0-9,\.]+)/) { push @log_score, $1; } } close SCORE; ###################### compute BIC ######################## # # BIC = log(likelihood) + (k/2)*log(n), where # k - number of parameters in the model, # n - sample size (DNA sequence length. # ########################################################### my @bic; foreach (1..56) { my $cur_log = shift @log_score; my $cur_num_par = shift @num_of_parameters; push @bic, $cur_log + $cur_num_par/2*log($sample_size); } ############### get all trees ########################### my @model_trees = &get_all_trees($cur_tree_file); ############## get branch length vectors ################ my @branch_vector; foreach (@model_list) { my $tree = shift @model_trees; push @branch_vector, [&get_branches($tree)]; } ################## compute loss table #################### my @loss; foreach my $i (0..55) { foreach my $j (0..55) { $loss[$i][$j] = &distance($branch_vector[$i], $branch_vector[$j]); } } ################# compute rescaled posterior risk ################## my $min_bic = @bic[&min_index(\@bic)]; my @rescaled_posterior_risk; foreach my $i (0..55) { my $sum = 0; foreach my $j (0..55) { if ($loss[$i][$j] > 0) { my $power = log($loss[$i][$j]) - $bic[$j] + $min_bic; if ($power > -30) { $sum = $sum + exp($power); } } } push @rescaled_posterior_risk, $sum; } ##################### select a model with min post risk ##################### print $model_list[&min_index(\@rescaled_posterior_risk)], "\n"; #################################### Subrutines ################################ # input parameters: PAUP* tree file # return value: a sting which contains a tree in Newick format # takes a name of the tree file and parses out the first tree it can find in this file. sub get_tree { my ($tree_file) = @_; open TREE, $tree_file or die "Can not open \"$tree_file\" ($!)"; my $line = ; while (!($line =~ /^tree/)) { $line = ; } close TREE; $line =~ /(\(.*);/; return $1; } # input parameters: a string with a tree in Newick format # return value: an array of branch lengths sub get_branches { my ($tree) = @_; my @br = (); while ($tree =~ /'[^']+'/) { $tree =~ s/'[^']+'//; } while ($tree =~ /([a-zA-Z_0-9.]+):/) { $tree =~ s/[a-zA-Z_0-9.]+:/:/; } while ($tree =~ /:([0-9.e\-+]+)/) { push @br, $1; $tree =~ s/:[0-9.e\-+]+//; } return @br; } # input parameters: two arrays (vestors) # return values: Euclidean distance between them sub distance { my ($array_ref1, $array_ref2)=@_; my @temp_array1 = @$array_ref1; my @temp_array2 = @$array_ref2; my $sum = 0; if (@temp_array1 != @temp_array2) { print "branch length vectors must be of the same length!!!\n"; } else { while (@temp_array1) { my $cur_elem1 = shift @temp_array1; my $cur_elem2 = shift @temp_array2; $sum += ($cur_elem1 - $cur_elem2)*($cur_elem1 - $cur_elem2); } } return sqrt $sum; } # input parameters: array of scalar values # return values: index of the first element with the minimum value sub min_index { my $array_ref = shift; my @array = @$array_ref; my $min_index = 0; foreach my $index (1..$#array) { if ($array[$index] < $array[$min_index]) { $min_index = $index; } } return $min_index; } # input parameters: PAUP* tree file # return value: an array of strings which contain trees in Newick format # takes a name of the tree file and parses out all trees it can find in this file. sub get_all_trees { my ($tree_file) = @_; my @trees = (); open TREE, $tree_file or die "Can not open \"$tree_file\" ($!)"; my $more_trees = 1; while ($more_trees) { my $line = ; while (defined($line) and !($line =~ /^tree/)) { $line = ; } if (defined($line)) { $line =~ /(\(.*);/; push @trees, $1; } else { $more_trees = 0; } } return @trees; }