octave_packages/nnet-0.1.13/subset.m

   1 ## Copyright (C) 2008 Michel D. Schmid  <michaelschmid@users.sourceforge.net>
   2 ##
   3 ##
   4 ## This program is free software; you can redistribute it and/or modify it
   5 ## under the terms of the GNU General Public License as published by
   6 ## the Free Software Foundation; either version 2, or (at your option)
   7 ## any later version.
   8 ##
   9 ## This program is distributed in the hope that it will be useful, but
  10 ## WITHOUT ANY WARRANTY; without even the implied warranty of
  11 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12 ## General Public License for more details.
  13 ##
  14 ## You should have received a copy of the GNU General Public License
  15 ## along with this program; see the file COPYING.  If not, see
  16 ## <http://www.gnu.org/licenses/>.
  17
  18 ## -*- texinfo -*-
  19 ## @deftypefn {Function File} {}[@var{mTrain}, @var{mTest}, @var{mVali}] = subset (@var{mData},@var{nTargets},@var{iOpti},@var{fTest},@var{fVali})
  20 ## @code{subset} splits the main data matrix which contains inputs and targets into 2 or 3 subsets
  21 ## depending on the parameters.
  22 ##
  23 ## The first parameter @var{mData} must be in row order. This means if the network
  24 ## contains three inputs, the matrix must have 3 rows and x columns to define the
  25 ## data for the inputs, plus some more rows for the outputs (targets), e.g. a neural network
  26 ## with three inputs and two outputs must have 5 rows with x columns!
  27 ## The second parameter @var{nTargets} defines the number of rows which contain the target values!
  28 ## The third argument @code{iOpti} is optional and can take one of three values:
  29 ##     0: no optimization
  30 ##     1: will randomise the column order and order the columns containing min and max values to be in the train set
  31 ##     2: will NOT randomise the column order, but order the columns containing min and max values to be in the train set
  32 ##     default value is @code{1}
  33 ## The fourth argument @code{fTest} is also optional and defines the fraction
  34 ## of the data sets which will be in the test set. Default value is @code{1/3}
  35 ## The fifth parameter @code{fVali} is also optional and defines the fraction
  36 ## of the data sets which will be in the validation set. Default value is @code{1/6}, or @code{0} if only @code{fTest} is given.
  37 ## So we have 50% of all data sets which are for training with the default values.
  38 ##
  39 ## @example
  40 ##   [mTrain, mTest, mVali] = subset(mData,1)
  41 ##   returns three subsets of the complete matrix
  42 ##   with randomized and optimized columns!
  43 ## @end example
  44 ## @example
  45 ##   [mTrain, mTest] = subset(mData,1,1,1/3)
  46 ##   returns two subsets (no validation set)
  47 ## @end example
  48 ##
  49 ## @end deftypefn
  50
  51 ## Author: Michel D. Schmid
  52
  53
  54 function [mTrain, mTest, mVali] = subset(mData,nTargets,iOpti,fTest,fVali)
  55
  56   ## check range of input arguments
  57   error(nargchk(2,5,nargin))
  58
  59   ## check the input arguments ...!
  60   if (nTargets==0)
  61     error("No TARGETS defined! This doesn't make any sense for feed-forward neural networks! Please define at least one row of targets")
  62   endif
  63
  64   ## set default values
  65   if (nargin==2)
  66     iOpti = 1;
  67     fTest = 1/3;
  68     fVali = 1/6;
  69   elseif (nargin==3)
  70     fTest = 1/3;
  71     fVali = 1/6;
  72   elseif (nargin==4)
  73     ## if fTest is set and nothing is set
  74     ## for fVali I assume that fVali is not used!
  75     fVali = 0;
  76   endif
  77
  78   ## calculate the number of train, test and validation sets
  79   fTrain = 1-fTest-fVali;
  80   nTrainSets = floor(size(mData,2)*fTrain);
  81   diffRestSets = size(mData,2)-nTrainSets;
  82   nTestSets = floor(size(mData,2)*fTest);
  83   nValiSets = size(mData,2)-nTrainSets-nTestSets;
  84
  85
  86   ## now let's see if matrix must be optimized!
  87   bOptiAgain = 1;
  88   while (bOptiAgain)
  89     if (iOpti == 1)
  90     # check that only one optimizing run is enough!!
  91     # maybe it's necessary to do it twice ..!
  92     # check that all min and max values are in the train set ...!
  93       mData = __optimizedatasets(mData,nTrainSets,nTargets,iOpti);
  94       mTrain = mData(:,1:nTrainSets);
  95       iRuns = size(mTrain,1);
  96       i = 1;
  97       j = 1;
  98       while (i < iRuns)
  99           if ( max(mTrain(i,:)) == max(mData(i,:)) )
 100             j += 1;
 101           endif
 102           i +=1;
 103       endwhile
 104       if (i==j)
 105         bOptiAgain = 0;
 106       endif
 107     elseif (iOpti == 2)
 108       # check that only one optimizing run is enough!!
 109       # maybe it's necessary to do it twice ..!
 110       # check that all min and max values are in the train set ...!
 111       mData = __optimizedatasets(mData,nTrainSets,nTargets,iOpti);
 112       mTrain = mData(:,1:nTrainSets);
 113       iRuns = size(mTrain,1);
 114       j = 1;
 115       i = 1;
 116       while (i < iRuns)
 117           if (max(mTrain(i,:))==max(mData(i,:)))
 118                         j += 1;
 119           endif
 120           i += 1;
 121       endwhile
 122       if (i==j)
 123         bOptiAgain = 0;
 124       endif
 125     else
 126       ## in this case, iOpti must be 0 ==> nothing todo
 127       bOptiAgain = 0;
 128     endif
 129   endwhile #END OF while(bOptiAgain)
 130
 131   ## now split up
 132   if (nargout==1)
 133     mTrain = mData;
 134   elseif (nargout==2);
 135     mTrain = mData(:,1:nTrainSets);
 136     mTest = mData(:,nTrainSets+1:nTrainSets+nTestSets);
 137   elseif (nargout==3)
 138     mTrain = mData(:,1:nTrainSets);
 139     mTest = mData(:,nTrainSets+1:nTrainSets+nTestSets);
 140     mVali = mData(:,nTrainSets+nTestSets+1:end);
 141   endif
 142
 143 endfunction
 144
 145 %!shared matrix, nTargets, mTrain, mTest, mVali
 146 %! disp("testing subset")
 147 %! matrix = [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 18 20; \
 148 %!                       0 2 4 1 3 5 3 4 1 -1 -2 -9 -1 10 12 20 11 11 11 11; \
 149 %!                      -2 2 2 2 2 0 0 0 0  0 10 12 13 12 13 44 33 32 98 11; \
 150 %!                       0 0 0 0 1 1 1 1 0  0  1  1  1  0  0  1  1  1  0  0; \
 151 %!           4 4 4 4 4 4 4 4 4  4  4  4  4  4  4  4  4  4  4  4; \
 152 %!           1 2 3 4 5 6 7 8 9 10 11 12 13 33 44 55 66 77 88 99];
 153 %! nTargets = 1; # the last row is equivalent to the target values.
 154 %! [mTrain, mTest, mVali] = subset(matrix,nTargets);  ############################
 155 %!assert(size(mTrain,2)==10);# 50% of 20
 156 %!assert(size(mTest,2)==6);# 1/3 of 20 = 6 (floor)
 157 %!assert(size(mVali,2)==4);# 1/6 of 20 = 4 (floor)
 158 %! # It's not possible to test the column order with this call!
 159 %! # randomizing is used! But all max and min values should be
 160 %! # in the training set
 161 %!assert(max(mTrain(1,:))==max(matrix(1,:)));
 162 %!assert(min(mTrain(1,:))==min(matrix(1,:)));
 163 %!assert(max(mTrain(2,:))==max(matrix(2,:)));
 164 %!assert(min(mTrain(2,:))==min(matrix(2,:)));
 165 %!assert(max(mTrain(3,:))==max(matrix(3,:)));
 166 %!assert(min(mTrain(3,:))==min(matrix(3,:)));
 167 %!assert(max(mTrain(4,:))==max(matrix(4,:)));
 168 %!assert(min(mTrain(4,:))==min(matrix(4,:)));
 169 %!
 170 %!
 171 %! [mTrain, mTest, mVali] = subset(matrix,nTargets,0);  ############################
 172 %!assert(size(mTrain,2)==10);# 50% of 20
 173 %!assert(size(mTest,2)==6);# 1/3 of 20 = 6 (floor)
 174 %!assert(size(mVali,2)==4);# 1/6 of 20 = 4 (floor)
 175 %!assert(mTrain==matrix(:,1:10));
 176 %!assert(mTest==matrix(:,11:16));
 177 %!assert(mVali==matrix(:,17:20));
 178 %!
 179 %!
 180 %! [mTrain, mTest, mVali] = subset(matrix,nTargets,2);  ############################
 181 %!assert(size(mTrain,2)==10);# 50% of 20
 182 %!assert(size(mTest,2)==6);# 1/3 of 20 = 6 (floor)
 183 %!assert(size(mVali,2)==4);# 1/6 of 20 = 4 (floor)
 184 %!assert(max(mTrain(1,:))==max(matrix(1,:)));
 185 %!assert(min(mTrain(1,:))==min(matrix(1,:)));
 186 %!assert(max(mTrain(2,:))==max(matrix(2,:)));
 187 %!assert(min(mTrain(2,:))==min(matrix(2,:)));
 188 %!assert(max(mTrain(3,:))==max(matrix(3,:)));
 189 %!assert(min(mTrain(3,:))==min(matrix(3,:)));
 190 %!assert(max(mTrain(4,:))==max(matrix(4,:)));
 191 %!assert(min(mTrain(4,:))==min(matrix(4,:)));
 192 %!
 193 %!
 194 %! ## next test ... optimize twice
 195 %! matrix = [1 2 3 4 5 6 7 20 8 10 11 12 13 14 15 16 17 18 18 9; \
 196 %!                       0 2 4 1 3 5 3 4 1 -1 -2 -9 -1 10 12 20 11 11 11 11; \
 197 %!                      -2 2 2 2 2 0 0 0 0  0 10 12 13 12 13 44 33 32 98 11; \
 198 %!                       0 0 0 0 1 1 1 1 0  0  1  1  1  0  0  1  1  1  0  0; \
 199 %!           4 4 4 4 4 4 4 4 4  4  4  4  4  4  4  4  4  4  4  4; \
 200 %!           1 2 3 4 5 6 7 8 9 10 11 12 13 33 44 55 66 77 88 99];
 201 %! [mTrain, mTest, mVali] = subset(matrix,nTargets,2);  ############################
 202 %!assert(max(mTrain(1,:))==max(matrix(1,:)));
 203 %!assert(min(mTrain(1,:))==min(matrix(1,:)));
 204 %!assert(max(mTrain(2,:))==max(matrix(2,:)));
 205 %!assert(min(mTrain(2,:))==min(matrix(2,:)));
 206 %!assert(max(mTrain(3,:))==max(matrix(3,:)));
 207 %!assert(min(mTrain(3,:))==min(matrix(3,:)));
 208 %!assert(max(mTrain(4,:))==max(matrix(4,:)));
 209 %!assert(min(mTrain(4,:))==min(matrix(4,:)));
 210
 211 ## \todo: a lot more tests are needed to be sure everything is working OK!!
 212 ## All combinations of arguments must be tested!