1 function df = dataframe(x = [], varargin)
4 %# @deftypefn {Function File} @var{df} = dataframe(@var{x = []}, ...)
5 %# This is the default constructor for a dataframe object, which is
6 %# similar to R 'data.frame'. It's a way to group tabular data, then
7 %# accessing them either as matrix or by column name.
8 %# Input argument x may be: @itemize
9 %# @item a dataframe => use @var{varargin} to pad it with supplemental
11 %# @item a matrix => create column names from input name; each column
12 %# is used as an entry
13 %# @item a cell matrix => try to infer column names from the first row,
14 %# and row indexes and names from the two first columns;
15 %# @item a file name => import data into a dataframe;
16 %# @item a matrix of char => initialise colnames from them.
17 %# @item a two-element cell: use the first as the column to
18 %# append to, and the second as initialiser for the column(s)
20 %# If called with an empty value, or with the default argument, it
21 %# returns an empty dataframe which can be further populated by
22 %# assignment, cat, ... If called without any argument, it should
23 %# return a dataframe from the whole workspace.
24 %# @*Variable input arguments are first parsed as pairs (options, values).
25 %# Recognised options are: @itemize
26 %# @item rownames : take the values as initialiser for row names
27 %# @item colnames : take the values as initialiser for column names
28 %# @item seeked : a (kept) field value which triggers start of processing.
29 %# @item trigger : an (unkept) field value which triggers start of processing.
30 %# Each preceding line is silently skipped. Default: none
31 %# @item unquot: a logical switch telling whether or not strings should
32 %# be unquoted before storage, default = true;
33 %# @item sep: the elements separator, default '\t,'
35 %# The remaining data are concatenated (right-appended) to the existing ones.
38 %% Copyright (C) 2009-2012 Pascal Dupuis <Pascal.Dupuis@uclouvain.be>
40 %% This file is part of Octave.
42 %% Octave is free software; you can redistribute it and/or
43 %% modify it under the terms of the GNU General Public
44 %% License as published by the Free Software Foundation;
45 %% either version 2, or (at your option) any later version.
47 %% Octave is distributed in the hope that it will be useful,
48 %% but WITHOUT ANY WARRANTY; without even the implied
49 %% warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
50 %% PURPOSE. See the GNU General Public License for more
53 %% You should have received a copy of the GNU General Public
54 %% License along with Octave; see the file COPYING. If not,
55 %% write to the Free Software Foundation, 51 Franklin Street -
56 %% Fifth Floor, Boston, MA 02110-1301, USA.
59 %# $Id: dataframe.m 9585 2012-02-05 15:32:46Z cdemills $
63 disp ('FIXME -- should create a dataframe from the whole workspace')
%# an explicitly passed empty x (exactly one argument) selects the
%# default constructor
68 if (isempty (x) && 1 == nargin)
69 %# default constructor: initialise the fields in the right order
71 df._name = {cell(0, 1), cell(1, 0)}; %# rows - cols
72 df._over = cell (1, 2);
74 df._data = cell (0, 0);
75 df._rep = cell (0, 0); %# a repetition index
76 df._type = cell (0, 0); %# the type of each column
77 df._src = cell (0, 0);
78 df._cmt = cell (0, 0); %# to put comments
79 df = class (df, 'dataframe');
83 if (isa (x, 'dataframe'))
85 elseif (isa (x, 'struct'))
%# a struct is turned into a dataframe object as is, and returned at once
86 df = class (x, 'dataframe'); return
88 df = dataframe ([]); %# get the right fields
%# default values of the options which may be overridden through varargin
92 seeked = []; trigger =[]; unquot = true; sep = "\t,"; cmt_lines = [];
95 if (length(varargin) > 0)
97 %# loop over possible arguments
%# each recognised option and its value are removed from varargin once
%# consumed, so indi is only advanced over entries which are skipped
98 while (indi <= size (varargin, 2))
99 if (isa (varargin{indi}, 'char'))
100 switch(varargin{indi})
%# row names: the value is converted to a cell array of strings
%# depending on its class
102 switch class (varargin{indi+1})
104 df._name{1} = varargin{indi+1};
106 df._name{1} = cellstr (varargin{indi+1});
108 df._name{1} = cellstr (num2str (varargin{indi+1}));
110 df._name{1} = genvarname (df._name{1});
%# NOTE(review): _over presumably flags names which may be overwritten
%# later (see "allow overwriting of column names" below) -- confirm
111 df._over{1}(1, 1:length (df._name{1})) = false;
112 df._cnt(1) = size (df._name{1}, 1);
113 df._ridx = (1:df._cnt(1))';
114 varargin(indi:indi+1) = [];
%# column names: same conversion as for the row names
116 switch class(varargin{indi+1})
118 df._name{2} = varargin{indi+1};
120 df._name{2} = cellstr (varargin{indi+1});
122 df._name{2} = cellstr (num2str (varargin{indi+1}));
124 %# detect assignment - functions calls - ranges
%# dummy holds, for each column name, the number of pieces obtained
%# when splitting it on the characters ':', '=' and '('
125 dummy = cellfun ('size', cellfun (@(x) strsplit (x, ":=("), df._name{2}, \
126 "UniformOutput", false), 2);
128 warning('dataframe colnames taken literally and not interpreted');
130 df._name{2} = genvarname (df._name{2});
131 df._over{2}(1, 1:length (df._name{2})) = false;
132 varargin(indi:indi+1) = [];
134 seeked = varargin{indi + 1};
135 varargin(indi:indi+1) = [];
137 trigger = varargin{indi + 1};
138 varargin(indi:indi+1) = [];
140 unquot = varargin{indi + 1};
141 varargin(indi:indi+1) = [];
143 sep = varargin{indi + 1};
144 varargin(indi:indi+1) = [];
%# the locales value is handed over to sscanf when converting fields
146 locales = varargin{indi + 1};
147 varargin(indi:indi+1) = [];
148 otherwise %# FIXME: just skip it for now
149 disp (sprintf ("Ignoring unkown argument %s", varargin{indi}));
153 indi = indi + 1; %# skip it
%# seeked and trigger may not be given together
158 if (!isempty (seeked) && !isempty (trigger))
159 error ('seeked and trigger are mutuallly incompatible arguments');
%# main loop: x and then the remaining entries of varargin are processed
%# in turn (x is reloaded from varargin{1, indi} at the end of the loop)
163 while (indi <= size(varargin, 2))
165 if (~isa (x, 'dataframe'))
%# a single-row char is taken as the name of a file to import
166 if (isa(x, 'char') && size(x, 1) < 2)
167 %# read the data frame from a file
169 dummy = tilde_expand (x);
171 df._src{end+1, 1} = dummy;
173 %# try our own method
%# the three bytes EF BB BF are the UTF-8 byte order mark
174 UTF8_BOM = char([0xEF 0xBB 0xBF]);
176 dummy = tilde_expand (x);
179 df._src{end+1, 1} = dummy;
181 if (!strcmp (dummy, UTF8_BOM))
184 %# slurp everything and convert doubles to char, avoiding
185 %# problems with char > 127
186 in = char (fread (fid).');
190 unwind_protect_cleanup
%# close the file if it has been successfully opened
191 if (fid != -1) fclose (fid); endif
195 %# explicit list taken from 'man pcrepattern' -- we enclose all
196 %# vertical separators in case the underlying regexp engine
197 %# doesn't have them all.
198 eol = '(\r\n|\n|\v|\f|\r|\x85)';
199 %# cut into lines -- include the EOL to have a one-to-one
200 %# matching between line numbers. Use a non-greedy match.
201 lines = regexp (in, ['.*?' eol], 'match');
%# dummy is the position of the EOL inside each line: 1 means that the
%# line holds nothing but its EOL
202 dummy = cellfun (@(x) regexp (x, eol), lines);
203 %# remove the EOL character(s)
204 lines(1 == dummy) = {""};
205 %# use a positive lookahead -- eol is not part of the match
206 lines(dummy > 1) = cellfun (@(x) regexp (x, ['.*?(?=' eol ')'], \
207 'match'), lines(dummy > 1));
208 %# a field starts either at a word boundary, or with + - . for
209 %# numeric data, or with ' for a string.
211 %# content = cellfun(@(x) regexp(x, '(\b|[-+\.''])[^,]*(''|\b)', 'match'),\
212 %# lines, 'UniformOutput', false); %# extract fields
213 content = cellfun (@(x) strsplit (x, sep), lines, \
214 'UniformOutput', false); %# extract fields
%# indl indexes the input lines, indj the rows of the output cell
215 indl = 1; indj = 1; %# disp('line 151 '); keyboard
216 if (~isempty (seeked))
%# look for the line whose first field is equal to seeked
217 while (indl <= length (lines))
218 dummy = content{indl};
219 if (all (cellfun ('size', dummy, 2) == 0))
223 dummy = content{indl};
224 if (strcmp (dummy{1}, seeked))
229 elseif (~isempty (trigger))
%# look for the line whose first field is equal to trigger
230 while (indl <= length (lines))
231 dummy = content{indl};
233 if (all (cellfun ('size', dummy, 2) == 0))
236 if (strcmp (dummy{1}, trigger))
%# preallocate: one row per remaining line, one column per field of the
%# last line stored in dummy
241 x = cell (1+length (lines)-indl, size(dummy, 2));
%# empty_lines collects the rows of x to be removed once parsing is done
242 empty_lines = []; cmt_lines = [];
243 while (indl <= length(lines))
244 dummy = content{indl};
%# a line whose fields are all empty is marked for removal
245 if (all (cellfun ('size', dummy, 2) == 0))
246 empty_lines = [empty_lines indj];
247 indl = indl + 1; indj = indj + 1;
250 %# does it look like a comment line ?
%# char(35) is the hash sign; a comment line is marked for removal
%# and its text is kept in cmt_lines
251 if (regexp (dummy{1}, ['^\s*' char(35)]))
252 empty_lines = [empty_lines indj];
253 cmt_lines = strvcat (cmt_lines, horzcat (dummy{:}));
254 indl = indl + 1; indj = indj + 1;
257 %# try to convert to float
258 the_line = cellfun (@(x) sscanf (x, "%f", locales), dummy, \
259 'UniformOutput', false);
260 for indk = (1:size (the_line, 2))
%# the conversion returned either nothing or more than one value
261 if (isempty (the_line{indk}) || any (size (the_line{indk}) > 1))
262 %#if indi > 1 && indk > 1, disp('line 117 '); keyboard; %#endif
265 %# remove quotes and leading space(s)
266 x(indj, indk) = regexp (dummy{indk}, '[^'' ].*[^'']', 'match'){1};
268 %# if the previous test fails, try a simpler one
269 in = regexp (dummy{indk}, '[^'' ]+', 'match');
271 x(indj, indk) = in{1};
273 %# x(indj, indk) = [];
277 %# no conversion possible, store and remove leading space(s)
278 x(indj, indk) = regexp (dummy{indk}, '[^ ].*', 'match');
281 if (!isempty (regexp (dummy{indk}, '[/:]')))
282 %# try to convert to a date
%# char(37) is the percent sign; the format is built this way rather
%# than written literally
283 [timeval, nfields] = strptime( dummy{indk},
284 [char(37) 'd/' char(37) 'm/' char(37) 'Y ' char(37) 'T']);
285 if (nfields > 0) %# at least a few fields are OK
286 timestr = strftime ([char(37) 'H:' char(37) 'M:' char(37) 'S'],
288 %# try to extract the usec field, if any
289 idx = regexp (dummy{indk}, timestr, 'end');
292 if (ispunct (dummy{indk}(idx)))
295 timeval.usec = str2num(dummy{indk}(idx:end));
297 x(indj, indk) = str2num (strftime ([char(37) 's'], timeval)) + ...
301 x(indj, indk) = the_line{indk};
305 indl = indl + 1; indj = indj + 1;
%# drop the rows marked as empty or as comments
307 if (!isempty(empty_lines))
308 x(empty_lines, :) = [];
310 %# detect empty columns
%# a column is empty when the widths of all its cells sum up to zero
311 empty_lines = find (0 == sum (cellfun ('size', x, 2)));
312 if (!isempty(empty_lines))
313 x(:, empty_lines) = [];
315 clear UTF8_BOM fid in lines indl the_line content empty_lines
316 clear timeval timestr nfields idx
321 %# fallback, avoiding a recursive call
323 if (!isa (x, 'char'))
%# target columns: right after the df._cnt(2) existing ones
324 indj = df._cnt(2)+(1:size (x, 2));
326 %# at this point, reading some filename failed
327 error("dataframe: can't open '%s' for reading data", x);
332 %# use the intermediate value as destination column
333 [indc, ncol] = df_name2idx (df._name{2}, x{1}, df._cnt(2), "column");
335 error (["With two-elements cell, the first should resolve " ...
336 "to a single column"]);
%# dummy holds the class of each element of one row of x{2}; it is
%# passed to df_pad below
339 dummy = cellfun ('class', x{2}(2, :), 'UniformOutput', false);
341 dummy = cellfun ('class', x{2}(1, :), 'UniformOutput', false);
343 df = df_pad (df, 2, [length(dummy) indc], dummy);
345 indj = indc + (1:size(x, 2)); %# redefine target range
347 if (isa (x{1}, 'cell'))
348 x = x{1}; %# remove one cell level
%# no name available yet for the target columns: derive them from the
%# name of the corresponding input argument
351 if (length (df._name{2}) < indj(1) || isempty (df._name{2}(indj)))
352 [df._name{2}(indj, 1), df._over{2}(1, indj)] ...
353 = df_colnames (inputname(indi), indj);
354 df._name{2} = genvarname (df._name{2});
356 %# allow overwriting of column names
357 df._over{2}(1, indj) = true;
360 if (1 == length (df._name{2}) && length (df._name{2}) < \
362 [df._name{2}(indj, 1), df._over{2}(1, indj)] ...
363 = df_colnames (char(df._name{2}), indj);
364 elseif (length (df._name{2}) < indj(1) || isempty (df._name{2}(indj)))
365 [df._name{2}(indj, 1), df._over{2}(1, indj)] ...
366 = df_colnames (inputname(indi), indj);
368 df._name{2} = genvarname (df._name{2});
372 %# the exact row size will be determined later
373 idx.subs = {'', indj};
374 %# use direct assignment
375 if (ndims (x) > 2), idx.subs{3} = 1:size (x, 3); endif
376 %# df = subsasgn(df, idx, x); <= call directly lower level
377 df = df_matassign (df, idx, indj, length(indj), x);
%# comment lines found while reading a file are appended to df._cmt
378 if (!isempty (cmt_lines))
379 df._cmt = vertcat(df._cmt, cellstr(cmt_lines));
%# refresh the column count from the list of column names
383 df._cnt(2) = length (df._name{2});
386 error ('Concatenating dataframes: use cat instead');
390 %# loop over next variable argument
391 x = varargin{1, indi};
393 %# disp('line 197 ???');
400 function [x, y] = df_colnames(base, num)
401 %# small auxiliary function to generate column names. This is required
402 %# here, as only the constructor can use inputname()
%# Inputs:
%#   base: char from which the common stem of the names is derived
%#   num:  numeric values, one per name; each one is appended to the stem
%# Outputs:
%#   x: char matrix with numel(num) rows, each row being the stem
%#      followed by the left-justified text of one element of num
%#   y: 1 x numel(num) replication of a flag; the constructor stores it
%#      into df._over{2}
%#      NOTE(review): presumably "this name may be overwritten" -- confirm
403 if (any ([index(base, "=")]))
404 %# takes the left part as base
405 x = strsplit (base, "=");
413 %# is base most probably a filename ?
%# the pattern matches a piece of text enclosed between single quotes
414 x = regexp (base, '''[^''].*[^'']''', 'match');
416 if (isvarname (base))
419 x = 'X'; y = true; %# this is a default value, may be changed
%# one row of stem per requested name, then append the numbers
427 x = repmat (x, numel (num), 1);
428 x = cstrcat (x, strjust (num2str (num(:)), 'left'));
429 y = repmat (y, 1, numel (num));