1 ## Copyright (C) 2003-2005 Andy Adler <adler@ncf.ca>
3 ## This program is free software; you can redistribute it and/or modify it under
4 ## the terms of the GNU General Public License as published by the Free Software
5 ## Foundation; either version 3 of the License, or (at your option) any later
8 ## This program is distributed in the hope that it will be useful, but WITHOUT
9 ## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 ## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
13 ## You should have received a copy of the GNU General Public License along with
14 ## this program; if not, see <http://www.gnu.org/licenses/>.
17 ## @deftypefn {Function File} {[@var{pval}, @var{f}, @var{df_b}, @var{df_e}] =} anovan (@var{data}, @var{grps})
18 ## @deftypefnx {Function File} {[@var{pval}, @var{f}, @var{df_b}, @var{df_e}] =} anovan (@var{data}, @var{grps}, 'param1', @var{value1})
19 ## Perform a multi-way analysis of variance (ANOVA). The goal is to test
20 ## whether the population means of data taken from @var{k} different
21 ## groups are all equal.
23 ## Data is a single vector @var{data} with groups specified by
24 ## a corresponding matrix of group labels @var{grps}, where @var{grps}
25 ## has the same number of rows as @var{data}. For example, if
26 ## @var{data} = [1.1;1.2]; @var{grps}= [1,2,1; 1,5,2];
27 ## then data point 1.1 was measured under conditions 1,2,1 and
28 ## data point 1.2 was measured under conditions 1,5,2.
29 ## Note that groups do not need to be sequentially numbered.
31 ## By default, a 'linear' model is used, computing the N main effects
32 ## with no interactions. this may be modified by param 'model'
34 ## p= anovan(data,groups, 'model', modeltype)
35 ## - modeltype = 'linear': compute N main effects
36 ## - modeltype = 'interaction': compute N effects and
37 ## N*(N-1) two-factor interactions
38 ## - modeltype = 'full': compute interactions at all levels
40 ## Under the null of constant means, the statistic @var{f} follows an F
41 ## distribution with @var{df_b} and @var{df_e} degrees of freedom.
43 ## The p-value (1 minus the CDF of this distribution at @var{f}) is
44 ## returned in @var{pval}.
46 ## If no output argument is given, the standard one-way ANOVA table is
49 ## BUG: DFE is incorrect for modeltypes != full
52 ## Author: Andy Adler <adler@ncf.ca>
53 ## Based on code by: KH <Kurt.Hornik@ci.tuwien.ac.at>
54 ## $Id: anovan.m 10203 2012-04-12 13:47:32Z carandraug $
57 ## 1. ANOVA ACCURACY: www.itl.nist.gov/div898/strd/anova/anova.html
58 ## Passes 'easy' test. Comes close on 'Average'. Fails 'Higher'.
59 ## This could be fixed with higher precision arithmetic
60 ## 2. Matlab anova2 test
61 ## www.mathworks.com/access/helpdesk/help/toolbox/stats/anova2.html
63 ## popcorn= [ 5.5 4.5 3.5; 5.5 4.5 4.0; 6.0 4.0 3.0;
64 ## 6.5 5.0 4.0; 7.0 5.5 5.0; 7.0 5.0 4.5];
65 ## % Define groups so reps = 3
66 ## groups = [ 1 1;1 2;1 3;1 1;1 2;1 3;1 1;1 2;1 3;
67 ## 2 1;2 2;2 3;2 1;2 2;2 3;2 1;2 2;2 3 ];
68 ## anovan( vec(popcorn'), groups, 'model', 'full')
69 ## % Results same as Matlab output
70 ## 3. Matlab anovan test
71 ## www.mathworks.com/access/helpdesk/help/toolbox/stats/anovan.html
73 ## y = [52.7 57.5 45.9 44.5 53.0 57.0 45.9 44.0]';
74 ## g1 = [1 2 1 2 1 2 1 2];
75 ## g2 = {'hi';'hi';'lo';'lo';'hi';'hi';'lo';'lo'};
76 ## g3 = {'may'; 'may'; 'may'; 'may'; 'june'; 'june'; 'june'; 'june'};
77 ## anovan( y', [g1',g2',g3'])
78 ## % Fails because we always do interactions
80 function [PVAL, FSTAT, DF_B, DFE] = anovan (data, grps, varargin)
83 usage ("anovan (data, grps)");
86 # test supplied parameters
89 param= varargin{idx-2};
90 value= varargin{idx-1};
92 if strcmp(param, 'model')
94 # elseif strcmp(param # add other parameters here
96 error(sprintf('parameter %s is not supported', param));
101 error ("anova: for `anova (data, grps)', data must be a vector");
104 nd = size (grps,1); # number of data points
105 nw = size (grps,2); # number of anova "ways"
106 if (~ isvector (data) || (length(data) ~= nd))
107 error ("anova: grps must be a matrix of the same number of rows as data");
110 [g,grp_map] = relabel_groups (grps);
111 if strcmp(modeltype, 'linear')
113 elseif strcmp(modeltype,'interaction')
115 elseif strcmp(modeltype,'full')
116 max_interact = rows(grps);
118 error(sprintf('modeltype %s is not supported', modeltype));
120 ng = length(grp_map);
121 int_tbl = interact_tbl (nw, ng, max_interact );
122 [gn, gs, gss] = raw_sums(data, g, ng, int_tbl);
124 stats_tbl = int_tbl(2:size(int_tbl,1),:)>0;
125 nstats= size(stats_tbl,1);
126 stats= zeros( nstats+1, 5); # SS, DF, MS, F, p
128 [SS, DF, MS]= factor_sums( gn, gs, gss, stats_tbl(i,:), ng, nw);
129 stats(i,1:3)= [SS, DF, MS];
132 # The Mean squared error is the data - avg for each possible measurement
133 # This calculation doesn't work unless there is replication for all grps
134 # SSE= sum( gss(sel) ) - sum( gs(sel).^2 ./ gn(sel) );
135 SST= gss(1) - gs(1)^2/gn(1);
136 SSE= SST - sum(stats(:,1));
137 sel = select_pat( ones(1,nw), ng, nw); %incorrect for modeltypes != full
138 DFE= sum( (gn(sel)-1).*(gn(sel)>0) );
140 stats(nstats+1,1:3)= [SSE, DFE, MSE];
146 pval = 1 - fcdf (F, DF, DFE);
147 stats(i,4:5)= [F, pval];
151 printout( stats, stats_tbl );
153 PVAL= stats(1:nstats,5);
154 FSTAT=stats(1:nstats,4);
155 DF_B= stats(1:nstats,2);
161 # relabel groups to a mapping from 1 to ng
163 # grps input grouping
165 # g relabelled grouping
166 # grp_map map from output to input grouping
167 function [g,grp_map] = relabel_groups(grps)
170 uniq = 1+[0;find(diff(s))];
171 # mapping from new grps to old groups
174 ngroups= length(uniq);
175 g= zeros(size(grp_vec));
177 g( find( grp_vec== grp_map(i) ) ) = i;
179 g= reshape(g, size(grps));
182 # Create interaction table
185 # nw number of "ways"
186 # ng number of ANOVA groups
187 # max_interact maximum number of interactions to consider
189 function int_tbl =interact_tbl(nw, ng, max_interact)
191 inter_tbl= zeros( combin, nw);
194 inter_tbl(:,i) = ( rem(idx,2^i) >= 2^(i-1) );
197 # find elements with more than max_interact 1's
198 idx = ( sum(inter_tbl',1) > max_interact );
199 inter_tbl(idx,:) =[];
200 combin= size(inter_tbl,1); # update value
203 # use ng+1 to map combinations of groups to integers
204 # this would be lots easier with a hash data structure
205 int_tbl = inter_tbl .* (ones(combin,1) * (ng+1).^(0:nw-1) );
208 # Calculate sums for each combination
211 # g relabelled grouping matrix
212 # ng number of ANOVA groups
215 # Output (virtual (ng+1)x(nw) matrices):
216 # gn number of data sums in each group
217 # gs sum of data in each group
218 # gss sumsqr of data in each group
219 function [gn, gs, gss] = raw_sums(data, g, ng, int_tbl);
222 gn= gs= gss= zeros((ng+1)^nw, 1);
224 # need offset by one for indexing
226 idx = 1+ int_tbl*g(i,:)';
233 # Calcualte the various factor sums
235 # gn number of data sums in each group
236 # gs sum of data in each group
237 # gss sumsqr of data in each group
238 # select binary vector of factor for this "way"?
239 # ng number of ANOVA groups
242 function [SS,DF]= raw_factor_sums( gn, gs, gss, select, ng, nw);
243 sel= select_pat( select, ng, nw);
244 ss_raw= gs(sel).^2 ./ gn(sel);
245 SS= sum( ss_raw( ~isnan(ss_raw) ));
246 if length(find(select>0))==1
247 DF= sum(gn(sel)>0)-1;
249 DF= 1; #this isn't the real DF, but needed to multiply
253 function [SS, DF, MS]= factor_sums( gn, gs, gss, select, ng, nw);
259 # zero terms added, one term subtracted, two added, etc
261 remove= find( rem( floor( i * 2.^(-lff+1:0) ), 2) );
264 sel1( ff( remove ) )=0;
266 [raw_sum,raw_df]= raw_factor_sums(gn,gs,gss,sel1,ng,nw);
268 add_sub= (-1)^length(remove);
269 SS+= add_sub*raw_sum;
276 # Calcualte the various factor sums
278 # select binary vector of factor for this "way"?
279 # ng number of ANOVA groups
281 function sel= select_pat( select, ng, nw);
282 # if select(i) is zero, remove nonzeros
283 # if select(i) is zero, remove zero terms for i
286 if length(select) ~= nw;
287 error("length of select must be = nw");
292 # expand 0:(ng+1)^nw in base ng+1
293 field= (0:(ng1)^nw-1)'* ng1.^(-nw+1:0);
294 field= rem( floor( field), ng1);
295 # select zero or non-zero elements
298 sel= find( all( field == ones(ng1^nw,1)*select(:)', 2) );
302 function printout( stats, stats_tbl );
303 nw= size( stats_tbl,2);
304 [jnk,order]= sort( sum(stats_tbl,2) );
306 printf('\n%d-way ANOVA Table (Factors A%s):\n\n', nw, ...
307 sprintf(',%c',toascii('A')+(1:nw-1)) );
308 printf('Source of Variation Sum Sqr df MeanSS Fval p-value\n');
309 printf('*********************************************************************\n');
310 printf('Error %10.2f %4d %10.2f\n', stats( size(stats,1),1:3));
313 str= sprintf(' %c x',toascii('A')+find(stats_tbl(i,:)>0)-1 );
314 str= str(1:length(str)-2); # remove x
315 printf('Factor %15s %10.2f %4d %10.2f %7.3f %7.6f\n', ...
322 # Test Data from http://maths.sci.shu.ac.uk/distance/stats/14.shtml
323 data=[7 9 9 8 12 10 ...
326 grp = [1,1; 1,1; 1,2; 1,2; 1,3; 1,3;
327 2,1; 2,1; 2,2; 2,2; 2,3; 2,3;
328 3,1; 3,1; 3,2; 3,2; 3,3; 3,3];
329 data=[7 9 9 8 12 10 9 8 ...
330 9 8 10 11 13 13 10 11 ...
331 9 10 10 12 10 12 10 12]';
332 grp = [1,4; 1,4; 1,5; 1,5; 1,6; 1,6; 1,7; 1,7;
333 2,4; 2,4; 2,5; 2,5; 2,6; 2,6; 2,7; 2,7;
334 3,4; 3,4; 3,5; 3,5; 3,6; 3,6; 3,7; 3,7];
335 # Test Data from http://maths.sci.shu.ac.uk/distance/stats/9.shtml
336 data=[9.5 11.1 11.9 12.8 ...
337 10.9 10.0 11.0 11.9 ...
338 11.2 10.4 10.8 13.4]';
340 # Test Data from http://maths.sci.shu.ac.uk/distance/stats/13.shtml
341 data=[7.56 9.68 11.65 ...
351 # Test Data from www.mathworks.com/
352 # access/helpdesk/help/toolbox/stats/linear10.shtml
353 data=[23 27 43 41 15 17 3 9 20 63 55 90];
354 grp= [ 1 1 1 1 2 2 2 2 3 3 3 3;
355 1 1 2 2 1 1 2 2 1 1 2 2]';