#! /usr/bin/perl -w ########################################################################################## # # file: $ENV{HOME}/Cleaned_Scripts/Analysis/decision_alg # last update: 08/05/02 # # The purpose of this script is to output models selected by "decision" algorithm # to a file for all repicants in a simulation # # Before execution of this file you must have the following files available: # # /"home_directory"/Score_Files/Simulation"number"/score_file"number" # # tree files in the form: # # /"home_directory"/Trees/Simulation"number"/Model_Trees/"model".tre # # the result of this script is a file: # # /"home_directory"/Results/Simulation"number"/decision_alg/models_file # # in which each line contains a model selected by the "decision" algorithm ########################################################################################## use strict; use lib "$ENV{HOME}/ProgramFiles"; use Tree; print "Enter the number of the simualtion you want to analyze: "; my $simulation_number = ; chomp $simulation_number; unless ($simulation_number =~ /^\d+$/){ print "Error: not a number!\n"; exit; } print "Enter sample size (DNA sequence length): "; my $sample_size = ; chomp $sample_size; unless ($sample_size =~ /^\d+$/){ print "Error: not a number!\n"; exit; } my $tree_dir = "$ENV{HOME}/VladsWork/Trees/Simulation$simulation_number/"; my $score_file = "$ENV{HOME}/VladsWork/Score_Files/Simulation$simulation_number/"; my $models_file = ">>$ENV{HOME}/VladsWork/Results/Simulation$simulation_number/Decision_Alg/models_file"; my $j=1; while($j <1000){ foreach my $data_set ($j..$j+19) { open MODELS, $models_file or die "Can not open \"$models_file\" ($!)"; my $cur_tree_dir = $tree_dir.'Data_Set_'.$data_set.'/'; my $cur_score_file = $score_file.'score_file_'.$data_set; open SCORE, $cur_score_file or die "Can not open \"$cur_score_file\" ($!)"; my $cur_line = ''; my @log_score = (); my @model_list = ( 'JC', 'JC+I', 'JC+G', 'JC+I+G', 'F81', 'F81+I', 'F81+G', 'F81+I+G', 'K80', 'K80+I', 'K80+G', 'K80+I+G', 'HKY', 'HKY+I', 'HKY+G', 'HKY+I+G', 'TrNef', 'TrNef+I', 'TrNef+G', 'TrNef+I+G', 'TrN', 'TrN+I', 'TrN+G', 'TrN+I+G', 'K81', 'K81+I', 'K81+G', 'K81+I+G', 'K81uf', 'K81uf+I', 'K81uf+G', 'K81uf+I+G', 'TIMef', 'TIMef+I', 'TIMef+G', 'TIMef+I+G', 'TIM', 'TIM+I', 'TIM+G', 'TIM+I+G', 'TVMef', 'TVMef+I', 'TVMef+G', 'TVMef+I+G', 'TVM', 'TVM+I', 'TVM+G', 'TVM+I+G', 'SYM', 'SYM+I', 'SYM+G', 'SYM+I+G', 'GTR', 'GTR+I', 'GTR+G', 'GTR+I+G' ); my @num_of_parameters = (0, 1, 1, 2, 3, 4, 4, 5, 1, 2, 2, 3, 4, 5, 5, 6, 2, 3, 3, 4, 5, 6, 6, 7, 2, 3, 3, 4, 5, 6, 6, 7, 3, 4, 4, 5, 6, 7, 7, 8, 4, 5, 5, 6, 7, 8, 8, 9, 5, 6, 6, 7, 8, 9, 9, 10); ################################# get log scores ############################# foreach (1..56) { $cur_line = ; $cur_line = ; if ($cur_line =~ /1\t([0-9,\.]+)/) { push @log_score, $1; } } close SCORE; ###################### compute BIC ######################## # # BIC = log(likelihood) + (k/2)*log(n), where # k - number of parameters in the model, # n - sample size (DNA sequence length. # ########################################################### my @bic; foreach (1..56) { my $cur_log = shift @log_score; my $cur_num_par = shift @num_of_parameters; push @bic, $cur_log + ($cur_num_par/2)*log($sample_size); } ############## get branch length vectors ################ my %branch_hash; foreach my $model (@model_list){ my $Tree_Dir = $cur_tree_dir.$model.'/'; foreach (@model_list) { my $tree_file = $Tree_Dir.$_.'.tre'; my $tree=''; open TREE, $tree_file or die "Can not open \"$tree_file\" ($!)"; $tree = &get_tree($tree_file); close TREE; #Every one of these arrays is related to a fixed tree following the dir #structure push @{$branch_hash{$model}}, [&get_branches($tree)]; } } ################## compute loss table #################### my @loss; my $j=0; foreach my $model (@model_list) { foreach my $i (0..55) { $loss[$i][$j] = &distance($branch_hash{$model}[$i], $branch_hash{$model}[$j]); } $j+=1 } ################# compute rescaled posterior risk ################## my $min_bic = @bic[&min_index(\@bic)]; my @rescaled_posterior_risk; foreach my $i (0..55) { my $sum = 0; foreach my $j (0..55) { if ($loss[$i][$j] > 0) { my $power = log($loss[$i][$j]) - $bic[$j] + $min_bic; if ($power > -30) { $sum = $sum + exp($power); } } } push @rescaled_posterior_risk, $sum; } ##################### select a model with min post risk ##################### print MODELS $model_list[&min_index(\@rescaled_posterior_risk)], "\n"; close MODELS; print "$data_set\n"; } $j+=20; }