1 function [rix,cix] = row_col_deletion(d,c,w)
2 % ROW_COL_DELETION selects the rows and columns to keep such that no missing values remain.
3 % A heuristic based on maximizing the number of remaining sample values
4 % is used. In other words, if there are more rows than columns, it is
5 % more likely that a row-wise deletion will be applied and vice versa.
7 % [rix,cix] = row_col_deletion(d)
8 % [rix,cix] = row_col_deletion(d,c,w)
11 % d data (each row is a sample, each column a feature)
12 % c class labels; only used to ignore samples whose label is NaN [OPTIONAL]
13 % w weight for each sample vector; must be non-negative, finite and sum to 1 [OPTIONAL]
15 % rix selected samples
16 % cix selected columns
18 % d(rix,cix) does not contain any NaNs, i.e., missing values
20 % see also: TRAIN_SC, TEST_SC
23 % Copyright (C) 2009,2010 by Alois Schloegl <alois.schloegl@gmail.com>
24 % This function is part of the NaN-toolbox
25 % http://pub.ist.ac.at/~schloegl/matlab/NaN/
27 % This program is free software; you can redistribute it and/or
28 % modify it under the terms of the GNU General Public License
29 % as published by the Free Software Foundation; either version 3
30 % of the License, or (at your option) any later version.
32 % This program is distributed in the hope that it will be useful,
33 % but WITHOUT ANY WARRANTY; without even the implied warranty of
34 % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
35 % GNU General Public License for more details.
37 % You should have received a copy of the GNU General Public License
38 % along with this program; if not, write to the Free Software
39 % Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.
43 if isempty(w) || all(w==w(1)),
46 ix = ~any(isnan(c) | isnan(w));
48 d = d(ix,:); %% ignore samples with invalid c or w
52 d = d(~isnan(c),:); %% ignore samples with invalid c or w
60 % decides whether row-wise or column-wise deletion removes less data.
61 % rix and cix are the resulting index vectors
62 % either row-wise or column-wise deletion, but not a combination of both, is used.
66 cix = find(~any(isnan(d),1));
67 rix = find(~any(isnan(d),2));
68 nr = length(rix)*size(d,2); % number of elements after row-wise deletion
69 nc = length(cix)*size(d,1); % number of elements after column-wise deletion
72 cix = 1:size(d,2); % select all columns
73 %fprintf(1,'row-wise deletion (%i,%i,%i)\n',n,nr,nc);
75 rix = 1:size(d,1); % select all rows
76 %fprintf(1,'column-wise deletion (%i,%i,%i)\n',n,nr,nc);
81 %% a mix of row- and column-wise deletion is possible
82 if ~isempty(w) && (abs(sum(w)-1) > log2(N)*eps || any(w<0) || any(~isfinite(w)))
83 error('weight vector must contain only non-negative and finite values');
86 rix = ones(N,1); cix = ones(1,M);
88 e = ~isnan(d(rix>0,cix>0));
90 colCost = mean(e, 1, w(rix>0)/sum(w(rix>0)))'; % cost of deleting columns
92 colCost = mean(e, 1)'; % cost of deleting columns
94 rowCost = mean(e, 2); % cost of deleting rows
95 [tmp,ix] = sort([colCost; rowCost]);
97 if abs(tmp(1)-1) < log2(N)*eps, break; end; % stopping criterion
99 if diff(tmp(1:2))==0, warning('row/col deletion: arbitrary selection [%i,%i]',ix(1:2)); end;
106 rix(tmp(ix-sum(cix))) = 0;