function df = dataframe(x = [], varargin)

  %# -*- texinfo -*-
  %# @deftypefn {Function File} @var{df} = dataframe(@var{x = []}, ...)
  %# This is the default constructor for a dataframe object, which is
  %# similar to R 'data.frame'. It's a way to group tabular data, then
  %# accessing them either as matrix or by column name.
  %# Input argument x may be: @itemize
  %# @item a dataframe => use @var{varargin} to pad it with supplemental
  %# columns
  %# @item a matrix => create column names from input name; each column
  %# is used as an entry
  %# @item a cell matrix => try to infer column names from the first row,
  %# and row indexes and names from the two first columns;
  %# @item a file name => import data into a dataframe;
  %# @item a matrix of char => initialise colnames from them.
  %# @item a two-element cell: use the first as the column to
  %# append to, and the second as initialiser for the column(s)
  %# @end itemize
  %# If called with an empty value, or with the default argument, it
  %# returns an empty dataframe which can be further populated by
  %# assignment, cat, ... If called without any argument, it should
  %# return a dataframe from the whole workspace.
  %# @*Variable input arguments are first parsed as pairs (options, values).
  %# Recognised options are: @itemize
  %# @item rownames : take the values as initialiser for row names
  %# @item colnames : take the values as initialiser for column names
  %# @item seeked : a (kept) field value which triggers start of processing.
  %# @item trigger : a (unkept) field value which triggers start of processing.
  %# Each preceding line is silently skipped. Default: none
  %# @item unquot: a logical switch telling whether or not strings should
  %# be unquoted before storage, default = true;
  %# @item sep: the elements separator, default '\t,'
  %# @item locales: third argument handed to sscanf when a field is
  %# converted to a number, default "C"
  %# @end itemize
  %# The remaining data are concatenated (right-appended) to the existing ones.
  %# @end deftypefn

  %% Copyright (C) 2009-2012 Pascal Dupuis
  %%
  %% This file is part of Octave.
  %%
  %% Octave is free software; you can redistribute it and/or
  %% modify it under the terms of the GNU General Public
  %% License as published by the Free Software Foundation;
  %% either version 2, or (at your option) any later version.
  %%
  %% Octave is distributed in the hope that it will be useful,
  %% but WITHOUT ANY WARRANTY; without even the implied
  %% warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  %% PURPOSE.  See the GNU General Public License for more
  %% details.
  %%
  %% You should have received a copy of the GNU General Public
  %% License along with Octave; see the file COPYING.  If not,
  %% write to the Free Software Foundation, 51 Franklin Street -
  %% Fifth Floor, Boston, MA 02110-1301, USA.

  %#
  %# $Id: dataframe.m 9585 2012-02-05 15:32:46Z cdemills $
  %#
  %# Source: octave_packages/dataframe-0.9.1/@dataframe/dataframe.m,
  %# CreaPhase repository (git.creatis.insa-lyon.fr), blob 9c2722e.
  %#

if (0 == nargin)
  disp ('FIXME -- should create a dataframe from the whole workspace')
  df = dataframe ([]);
  return
endif

if (isempty (x) && 1 == nargin)
  %# default constructor: initialise the fields in the right order
  df._cnt = [0 0];
  df._name = {cell(0, 1), cell(1, 0)}; %# rows - cols
  df._over = cell (1, 2);
  df._ridx = [];
  df._data = cell (0, 0);
  df._rep = cell (0, 0); %# a repetition index
  df._type = cell (0, 0); %# the type of each column
  df._src = cell (0, 0);
  df._cmt = cell (0, 0); %# to put comments
  df = class (df, 'dataframe');
  return
endif

if (isa (x, 'dataframe'))
  df = x;
elseif (isa (x, 'struct'))
  %# a struct is blessed as-is; no option parsing takes place
  df = class (x, 'dataframe'); return
else
  df = dataframe ([]); %# get the right fields
endif

%# default values
seeked = []; trigger =[]; unquot = true; sep = "\t,"; cmt_lines = [];
locales = "C";

if (length(varargin) > 0)
  indi = 1;
  %# loop over possible arguments; each recognised (option, value) pair is
  %# removed from varargin so that only data arguments remain afterwards
  while (indi <= size (varargin, 2))
    if (isa (varargin{indi}, 'char'))
      switch(varargin{indi})
        case 'rownames'
          switch class (varargin{indi+1})
            case {'cell'}
              df._name{1} = varargin{indi+1};
            case {'char'}
              df._name{1} = cellstr (varargin{indi+1});
            otherwise
              df._name{1} = cellstr (num2str (varargin{indi+1}));
          endswitch
          df._name{1} = genvarname (df._name{1});
          df._over{1}(1, 1:length (df._name{1})) = false;
          df._cnt(1) = size (df._name{1}, 1);
          df._ridx = (1:df._cnt(1))';
          varargin(indi:indi+1) = [];
        case 'colnames'
          switch class(varargin{indi+1})
            case {'cell'}
              df._name{2} = varargin{indi+1};
            case {'char'}
              df._name{2} = cellstr (varargin{indi+1});
            otherwise
              df._name{2} = cellstr (num2str (varargin{indi+1}));
          endswitch
          %# detect assignment - functions calls - ranges
          dummy = cellfun ('size', ...
                           cellfun (@(x) df_strsplit (x, ":=("), df._name{2}, ...
                                    "UniformOutput", false), 2);
          if (any(dummy > 1))
            warning('dataframe colnames taken literally and not interpreted');
          endif
          df._name{2} = genvarname (df._name{2});
          df._over{2}(1, 1:length (df._name{2})) = false;
          varargin(indi:indi+1) = [];
        case 'seeked'
          seeked = varargin{indi + 1};
          varargin(indi:indi+1) = [];
        case 'trigger'
          trigger = varargin{indi + 1};
          varargin(indi:indi+1) = [];
        case 'unquot'
          unquot = varargin{indi + 1};
          varargin(indi:indi+1) = [];
        case 'sep'
          sep = varargin{indi + 1};
          varargin(indi:indi+1) = [];
        case 'locales'
          locales = varargin{indi + 1};
          varargin(indi:indi+1) = [];
        otherwise %# FIXME: just skip it for now
          disp (sprintf ("Ignoring unkown argument %s", varargin{indi}));
          indi = indi + 1;
      endswitch
    else
      indi = indi + 1;    %# skip it
    endif
  endwhile
endif

if (!isempty (seeked) && !isempty (trigger))
  error ('seeked and trigger are mutuallly incompatible arguments');
endif

%# The loop body runs once for x and once more for each remaining element
%# of varargin: every pass handles the current x, then fetches the next
%# argument at its end. The fetch on the last pass fails and is swallowed
%# by the try block, after which the loop condition ends the iteration.
indi = 0;
while (indi <= size(varargin, 2))
  indi = indi + 1;
  if (~isa (x, 'dataframe'))
    if (isa(x, 'char') && size(x, 1) < 2)
      %# read the data frame from a file
      try
        dummy = tilde_expand (x);
        x = load (dummy);
        df._src{end+1, 1} = dummy;
      catch
        %# try our own method
        UTF8_BOM = char([0xEF 0xBB 0xBF]);
        %# give fid a value the cleanup clause can test even when an error
        %# is raised before fopen is reached
        fid = -1;
        unwind_protect
          dummy = tilde_expand (x);
          fid = fopen (dummy);
          if (fid != -1)
            df._src{end+1, 1} = dummy;
            %# skip a leading byte-order mark: only the first three bytes
            %# are inspected, so a mark followed by data on the same line
            %# is removed and nothing else is consumed
            dummy = fread (fid, 3).';
            if (!isequal (dummy, double (UTF8_BOM)))
              frewind (fid);
            endif
            %# slurp everything and convert doubles to char, avoiding
            %# problems with char > 127
            in = char (fread (fid).');
          else
            in = [];
          endif
        unwind_protect_cleanup
          if (fid != -1) fclose (fid); endif
        end_unwind_protect

        if (!isempty (in))
          %# explicit list taken from 'man pcrepattern' -- we enclose all
          %# vertical separators in case the underlying regexp engine
          %# doesn't have them all.
          eol = '(\r\n|\n|\v|\f|\r|\x85)';
          %# the splitting below only keeps text that is followed by an
          %# EOL; terminate an unterminated last line so it is not lost
          if (!any (in(end) == char ([10 11 12 13 133])))
            in(end+1) = "\n";
          endif
          %# cut into lines -- include the EOL to have a one-to-one
          %# matching between line numbers. Use a non-greedy match.
          lines = regexp (in, ['.*?' eol], 'match');
          dummy = cellfun (@(x) regexp (x, eol), lines);
          %# remove the EOL character(s)
          lines(1 == dummy) = {""};
          %# use a positive lookahead -- eol is not part of the match
          lines(dummy > 1) = cellfun (@(x) regexp (x, ['.*?(?=' eol ')'], ...
                                                   'match'), lines(dummy > 1));
          %# a field either starts at a word boundary, either by + - . for
          %# a numeric data, either by ' for a string.

          %# content = cellfun(@(x) regexp(x, '(\b|[-+\.''])[^,]*(''|\b)', 'match'),\
          %# lines, 'UniformOutput', false); %# extract fields
          content = cellfun (@(x) df_strsplit (x, sep), lines, ...
                             'UniformOutput', false); %# extract fields
          indl = 1; indj = 1; %# disp('line 151 '); keyboard
          if (~isempty (seeked))
            %# advance to the first line whose first field equals seeked;
            %# that line is kept as the first line of data
            while (indl <= length (lines))
              dummy = content{indl};
              if (all (cellfun ('size', dummy, 2) == 0))
                indl = indl + 1;
                continue;
              endif
              dummy = content{indl};
              if (strcmp (dummy{1}, seeked))
                break;
              endif
              indl = indl + 1;
            endwhile
          elseif (~isempty (trigger))
            %# advance past the first line whose first field equals
            %# trigger; that line itself is not kept
            while (indl <= length (lines))
              dummy = content{indl};
              indl = indl + 1;
              if (all (cellfun ('size', dummy, 2) == 0))
                continue;
              endif
              if (strcmp (dummy{1}, trigger))
                break;
              endif
            endwhile
          endif
          %# preallocate one row per remaining line; the width is only a
          %# first guess (fields of the first remaining line) since the
          %# cell grows on assignment and empty columns are trimmed below
          if (indl <= length (lines))
            nfld = size (content{indl}, 2);
          else
            nfld = 0;
          endif
          x = cell (1+length (lines)-indl, nfld);
          empty_lines = []; cmt_lines = [];
          while (indl <= length(lines))
            dummy = content{indl};
            if (all (cellfun ('size', dummy, 2) == 0))
              empty_lines = [empty_lines indj];
              indl = indl + 1; indj = indj + 1;
              continue;
            endif
            %# does it looks like a comment line ?
            if (regexp (dummy{1}, ['^\s*' char(35)]))
              empty_lines = [empty_lines indj];
              cmt_lines = strvcat (cmt_lines, horzcat (dummy{:}));
              indl = indl + 1; indj = indj + 1;
              continue;
            endif
            %# try to convert to float
            the_line = cellfun (@(x) sscanf (x, "%f", locales), dummy, ...
                                'UniformOutput', false);
            for indk = (1:size (the_line, 2))
              if (isempty (the_line{indk}) || any (size (the_line{indk}) > 1))
                %#if indi > 1 && indk > 1, disp('line 117 '); keyboard; %#endif
                if (unquot)
                  try
                    %# remove quotes and leading space(s)
                    x(indj, indk) = regexp (dummy{indk}, '[^'' ].*[^'']', 'match'){1};
                  catch
                    %# if the previous test fails, try a simpler one
                    in = regexp (dummy{indk}, '[^'' ]+', 'match');
                    if (!isempty(in))
                      x(indj, indk) = in{1};
                    %# else
                    %#   x(indj, indk) = [];
                    endif
                  end_try_catch
                else
                  %# no conversion possible, store and remove leading space(s)
                  x(indj, indk) = regexp (dummy{indk}, '[^ ].*', 'match');
                endif
              else
                if (!isempty (regexp (dummy{indk}, '[/:]')))
                  %# try to convert to a date
                  %# NOTE(review): when strptime rejects the field the cell
                  %# is left empty although the_line{indk} holds a number;
                  %# confirm that this is intended
                  [timeval, nfields] = strptime (dummy{indk}, ...
                      [char(37) 'd/' char(37) 'm/' char(37) 'Y ' char(37) 'T']);
                  if (nfields > 0) %# at least a few fields are OK
                    timestr = strftime ([char(37) 'H:' char(37) 'M:' char(37) 'S'], ...
                                        timeval);
                    %# try to extract the usec field, if any
                    idx = regexp (dummy{indk}, timestr, 'end');
                    if (!isempty (idx))
                      idx = idx(1) + 1;
                      %# the time may be the very end of the field: test
                      %# the bound before looking at the next character
                      if (idx <= length (dummy{indk}) ...
                          && ispunct (dummy{indk}(idx)))
                        idx = idx + 1;
                      endif
                      %# str2double instead of str2num: the text comes
                      %# from the file and must not be evaluated
                      usec = str2double (dummy{indk}(idx:end));
                      if (!isnan (usec))
                        timeval.usec = usec;
                      endif
                    endif
                    x(indj, indk) = str2num (strftime ([char(37) 's'], timeval)) + ...
                        timeval.usec * 1e-6;
                  endif
                else
                  x(indj, indk) = the_line{indk};
                endif
              endif
            endfor
            indl = indl + 1; indj = indj + 1;
          endwhile
          if (!isempty(empty_lines))
            x(empty_lines, :) = [];
          endif
          %# detect empty columns; sum explicitly along the rows so that a
          %# single remaining row is not collapsed into one scalar
          empty_lines = find (0 == sum (cellfun ('size', x, 2), 1));
          if (!isempty(empty_lines))
            x(:, empty_lines) = [];
          endif
          %# idx has to be cleared: it is reused as a struct right below
          clear UTF8_BOM fid in lines indl the_line content empty_lines
          clear timeval timestr nfields idx usec nfld
        endif
      end_try_catch
    endif

    %# fallback, avoiding a recursive call
    idx.type = '()';
    if (!isa (x, 'char'))
      indj = df._cnt(2)+(1:size (x, 2));
    else
      %# at this point, reading some filename failed
      error("dataframe: can't open '%s' for reading data", x);
    endif;

    if (iscell(x))
      if (2 == length (x))
        %# use the intermediate value as destination column
        [indc, ncol] = df_name2idx (df._name{2}, x{1}, df._cnt(2), "column");
        if (ncol != 1)
          error (["With two-elements cell, the first should resolve " ...
                  "to a single column"]);
        endif
        %# take the column classes from the second row, or from the first
        %# one when there is no second row
        try
          dummy = cellfun ('class', x{2}(2, :), 'UniformOutput', false);
        catch
          dummy = cellfun ('class', x{2}(1, :), 'UniformOutput', false);
        end_try_catch
        df = df_pad (df, 2, [length(dummy) indc], dummy);
        x = x{2};
        indj = indc + (1:size(x, 2)); %# redefine target range
      else
        if (isa (x{1}, 'cell'))
          x = x{1}; %# remove one cell level
        endif
      endif
      if (length (df._name{2}) < indj(1) || isempty (df._name{2}(indj)))
        [df._name{2}(indj, 1), df._over{2}(1, indj)] ...
            = df_colnames (inputname(indi), indj);
        df._name{2} = genvarname (df._name{2});
      endif
      %# allow overwriting of column names
      df._over{2}(1, indj) = true;
    else
      if (!isempty(indj))
        if (1 == length (df._name{2}) && length (df._name{2}) < ...
            length (indj))
          %# a single name was given for several columns: use it as base
          [df._name{2}(indj, 1), df._over{2}(1, indj)] ...
              = df_colnames (char(df._name{2}), indj);
        elseif (length (df._name{2}) < indj(1) || isempty (df._name{2}(indj)))
          [df._name{2}(indj, 1), df._over{2}(1, indj)] ...
              = df_colnames (inputname(indi), indj);
        endif
        df._name{2} = genvarname (df._name{2});
      endif
    endif
    if (!isempty (indj))
      %# the exact row size will be determined later
      idx.subs = {'', indj};
      %# use direct assignment
      if (ndims (x) > 2), idx.subs{3} = 1:size (x, 3); endif
      %# df = subsasgn(df, idx, x); <= call directly lower level
      df = df_matassign (df, idx, indj, length(indj), x);
      if (!isempty (cmt_lines))
        df._cmt = vertcat(df._cmt, cellstr(cmt_lines));
        cmt_lines = [];
      endif
    else
      df._cnt(2) = length (df._name{2});
    endif
  elseif (indi > 1)
    error ('Concatenating dataframes: use cat instead');
  endif

  try
    %# loop over next variable argument
    x = varargin{1, indi};
  catch
    %# disp('line 197 ???');
  end_try_catch

endwhile

endfunction

function parts = df_strsplit (str, seps)
  %# small auxiliary function: split str at each occurrence of any single
  %# character listed in seps, returning the pieces as a cell array.
  %# ostrsplit is preferred when the interpreter provides it.
  %# NOTE(review): the fallback assumes that, where ostrsplit is missing,
  %# strsplit itself treats seps as a set of characters -- confirm for the
  %# Octave releases this package has to support.
  if (exist ("ostrsplit"))
    parts = ostrsplit (str, seps);
  else
    parts = strsplit (str, seps);
  endif
endfunction

function [x, y] = df_colnames(base, num)
  %# small auxiliary function to generate column names. This is required
  %# here, as only the constructor can use inputname()
  %#
  %# base: text to derive the name from (the caller passes either the
  %#       result of inputname() or an existing column name)
  %# num : column indexes; when there is more than one, each index is
  %#       appended to the base name
  %# x   : cellstr with the generated name(s)
  %# y   : logical flag(s) returned along with the names; false only when
  %#       the name is a valid variable name taken from base (the caller
  %#       stores it in df._over{2})
  if (any ([index(base, "=")]))
    %# takes the left part as base
    x = strsplit (base, "=");
    x = deblank (x{1});
    if (isvarname (x))
      y = false;
    else
      x = 'X'; y = true;
    endif
  else
    %# is base most probably a filename ?
    x = regexp (base, '''[^''].*[^'']''', 'match');
    if (isempty (x))
      if (isvarname (base))
        x = base; y = false;
      else
        x = 'X'; y = true; %# this is a default value, may be changed
      endif
    else
      x = x{1}; y = true;
    endif
  endif

  if (numel (num) > 1)
    %# one row per column: base name followed by the left-justified index
    x = repmat (x, numel (num), 1);
    x = cstrcat (x, strjust (num2str (num(:)), 'left'));
    y = repmat (y, 1, numel (num));
  endif

  x = cellstr (x);

endfunction