octave_packages/dataframe-0.9.1/@dataframe/dataframe.m

   1 function df = dataframe(x = [], varargin)
   2
   3   %# -*- texinfo -*-
   4   %#  @deftypefn {Function File} @var{df} = dataframe(@var{x = []}, ...)
   5   %# This is the default constructor for a dataframe object, which is
   6   %# similar to R 'data.frame'. It's a way to group tabular data, then
   7   %# accessing them either as matrix or by column name.
   8   %# Input argument x may be: @itemize
   9   %# @item a dataframe => use @var{varargin} to pad it with suplemental
  10   %# columns
  11   %# @item a matrix => create column names from input name; each column
  12   %# is used as an entry
  13   %# @item a cell matrix => try to infer column names from the first row,
  14   %#   and row indexes and names from the two first columns;
  15   %# @item a file name => import data into a dataframe;
  16   %# @item a matrix of char => initialise colnames from them.
  17   %# @item a two-element cell: use the first as column as column to
  18   %# append to,  and the second as initialiser for the column(s)
  19   %# @end itemize
  20   %# If called with an empty value, or with the default argument, it
  21   %# returns an empty dataframe which can be further populated by
  22   %# assignement, cat, ... If called without any argument, it should
  23   %# return a dataframe from the whole workspace.
  24   %# @*Variable input arguments are first parsed as pairs (options, values).
  25   %# Recognised options are: @itemize
  26   %# @item rownames : take the values as initialiser for row names
  27   %# @item colnames : take the values as initialiser for column names
  28   %# @item seeked : a (kept) field value which triggers start of processing.
  29   %# @item trigger : a (unkept) field value which triggers start of processing.
  30   %# Each preceeding line is silently skipped. Default: none
  31   %# @item unquot: a logical switch telling wheter or not strings should
  32   %# be unquoted before storage, default = true;
  33   %# @item sep: the elements separator, default '\t,'
  34   %# @end itemize
  35   %# The remaining data are concatenated (right-appended) to the existing ones.
  36   %# @end deftypefn
  37
  38   %% Copyright (C) 2009-2012 Pascal Dupuis <Pascal.Dupuis@uclouvain.be>
  39   %%
  40   %% This file is part of Octave.
  41   %%
  42   %% Octave is free software; you can redistribute it and/or
  43   %% modify it under the terms of the GNU General Public
  44   %% License as published by the Free Software Foundation;
  45   %% either version 2, or (at your option) any later version.
  46   %%
  47   %% Octave is distributed in the hope that it will be useful,
  48   %% but WITHOUT ANY WARRANTY; without even the implied
  49   %% warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  50   %% PURPOSE.  See the GNU General Public License for more
  51   %% details.
  52   %%
  53   %% You should have received a copy of the GNU General Public
  54   %% License along with Octave; see the file COPYING.  If not,
  55   %% write to the Free Software Foundation, 51 Franklin Street -
  56   %% Fifth Floor, Boston, MA 02110-1301, USA.
  57
  58   %#
  59   %# $Id: dataframe.m 9585 2012-02-05 15:32:46Z cdemills $
  60   %#
  61
  62 if (0 == nargin)
  63   disp ('FIXME -- should create a dataframe from the whole workspace')
  64   df = dataframe ([]);
  65   return
  66 endif
  67
  68 if (isempty (x) && 1 == nargin)
  69   %# default constructor: initialise the fields in the right order
  70   df._cnt = [0 0];
  71   df._name = {cell(0, 1), cell(1, 0)}; %# rows - cols
  72   df._over = cell (1, 2);
  73   df._ridx = [];
  74   df._data = cell (0, 0);
  75   df._rep = cell (0, 0);   %# a repetition index
  76   df._type = cell (0, 0);  %# the type of each column
  77   df._src = cell (0, 0);
  78   df._cmt = cell (0, 0);   %# to put comments
  79   df = class (df, 'dataframe');
  80   return
  81 endif
  82
  83 if (isa (x, 'dataframe'))
  84   df = x;
  85 elseif (isa (x, 'struct'))
  86   df = class (x, 'dataframe'); return
  87 else
  88   df = dataframe ([]); %# get the right fields
  89 endif
  90
  91 %# default values
  92 seeked = []; trigger =[]; unquot = true; sep = "\t,"; cmt_lines = [];
  93 locales = "C";
  94
  95 if (length(varargin) > 0)
  96   indi = 1;
  97   %# loop over possible arguments
  98   while (indi <= size (varargin, 2))
  99     if (isa (varargin{indi}, 'char'))
 100       switch(varargin{indi})
 101         case 'rownames'
 102           switch class (varargin{indi+1})
 103             case {'cell'}
 104               df._name{1} = varargin{indi+1};
 105             case {'char'}
 106               df._name{1} = cellstr (varargin{indi+1});
 107             otherwise
 108               df._name{1} = cellstr (num2str (varargin{indi+1}));
 109           endswitch
 110           df._name{1} = genvarname (df._name{1});
 111           df._over{1}(1, 1:length (df._name{1})) = false;
 112           df._cnt(1) = size (df._name{1}, 1);
 113           df._ridx = (1:df._cnt(1))';
 114           varargin(indi:indi+1) = [];
 115         case 'colnames'
 116           switch class(varargin{indi+1})
 117             case {'cell'}
 118               df._name{2} = varargin{indi+1};
 119             case {'char'}
 120               df._name{2} = cellstr (varargin{indi+1});
 121             otherwise
 122               df._name{2} = cellstr (num2str (varargin{indi+1}));
 123           endswitch
 124           %# detect assignment - functions calls - ranges
 125           dummy = cellfun ('size', cellfun (@(x) strsplit (x, ":=("), df._name{2}, \
 126                                             "UniformOutput", false), 2);
 127           if (any(dummy > 1))
 128             warning('dataframe colnames taken literally and not interpreted');
 129           endif
 130           df._name{2} = genvarname (df._name{2});
 131           df._over{2}(1, 1:length (df._name{2})) = false;
 132           varargin(indi:indi+1) = [];
 133         case 'seeked',
 134           seeked = varargin{indi + 1};
 135           varargin(indi:indi+1) = [];
 136         case 'trigger',
 137           trigger = varargin{indi + 1};
 138           varargin(indi:indi+1) = [];
 139         case 'unquot',
 140           unquot = varargin{indi + 1};
 141           varargin(indi:indi+1) = [];
 142         case 'sep',
 143           sep = varargin{indi + 1};
 144           varargin(indi:indi+1) = [];
 145         case 'locales'
 146           locales = varargin{indi + 1};
 147           varargin(indi:indi+1) = [];
 148         otherwise %# FIXME: just skip it for now
 149           disp (sprintf ("Ignoring unkown argument %s", varargin{indi}));
 150           indi = indi + 1;
 151       endswitch
 152     else
 153       indi = indi + 1;    %# skip it
 154     endif
 155   endwhile
 156 endif
 157
 158 if (!isempty (seeked) && !isempty (trigger))
 159   error ('seeked and trigger are mutuallly incompatible arguments');
 160 endif
 161
 162 indi = 0;
 163 while (indi <= size(varargin, 2))
 164   indi = indi + 1;
 165   if (~isa (x, 'dataframe'))
 166     if (isa(x, 'char') && size(x, 1) < 2)
 167       %# read the data frame from a file
 168       try
 169         dummy = tilde_expand (x);
 170         x = load (dummy);
 171         df._src{end+1, 1} = dummy;
 172       catch
 173         %# try our own method
 174         UTF8_BOM = char([0xEF 0xBB 0xBF]);
 175         unwind_protect
 176           dummy = tilde_expand (x);
 177           fid = fopen (dummy);
 178           if (fid != -1)
 179             df._src{end+1, 1} = dummy;
 180             dummy = fgetl (fid);
 181             if (!strcmp (dummy, UTF8_BOM))
 182               frewind (fid);
 183             endif
 184             %# slurp everything and convert doubles to char, avoiding
 185             %# problems with char > 127
 186             in = char (fread (fid).');
 187           else
 188             in = [];
 189           endif
 190         unwind_protect_cleanup
 191           if (fid != -1) fclose (fid); endif
 192         end_unwind_protect
 193
 194         if (!isempty (in))
 195           %# explicit list taken from 'man pcrepattern' -- we enclose all
 196           %# vertical separators in case the underlying regexp engine
 197           %# doesn't have them all.
 198           eol = '(\r\n|\n|\v|\f|\r|\x85)';
 199           %# cut into lines -- include the EOL to have a one-to-one
 200           %# matching between line numbers. Use a non-greedy match.
 201           lines = regexp (in, ['.*?' eol], 'match');
 202           dummy = cellfun (@(x) regexp (x, eol), lines);
 203           %# remove the EOL character(s)
 204           lines(1 == dummy) = {""};
 205           %# use a positive lookahead -- eol is not part of the match
 206           lines(dummy > 1) = cellfun (@(x) regexp (x, ['.*?(?=' eol ')'], \
 207                                                    'match'), lines(dummy > 1));
 208           %# a field either starts at a word boundary, either by + - . for
 209           %# a numeric data, either by ' for a string.
 210
 211           %# content = cellfun(@(x) regexp(x, '(\b|[-+\.''])[^,]*(''|\b)', 'match'),\
 212           %# lines, 'UniformOutput', false); %# extract fields
 213           content = cellfun (@(x) strsplit (x, sep), lines, \
 214                              'UniformOutput', false); %# extract fields
 215           indl = 1; indj = 1; %# disp('line 151 '); keyboard
 216           if (~isempty (seeked))
 217             while (indl <= length (lines))
 218               dummy = content{indl};
 219               if (all (cellfun ('size', dummy, 2) == 0))
 220                 indl = indl + 1;
 221                 continue;
 222               endif
 223               dummy = content{indl};
 224               if (strcmp (dummy{1}, seeked))
 225                 break;
 226               endif
 227               indl = indl + 1;
 228             endwhile
 229           elseif (~isempty (trigger))
 230             while (indl <= length (lines))
 231               dummy = content{indl};
 232               indl = indl + 1;
 233               if (all (cellfun ('size', dummy, 2) == 0))
 234                 continue;
 235               endif
 236               if (strcmp (dummy{1}, trigger))
 237                 break;
 238               endif
 239             endwhile
 240           endif
 241           x = cell (1+length (lines)-indl, size(dummy, 2));
 242           empty_lines = []; cmt_lines = [];
 243           while (indl <= length(lines))
 244             dummy = content{indl};
 245             if (all (cellfun ('size', dummy, 2) == 0))
 246               empty_lines = [empty_lines indj];
 247               indl = indl + 1; indj = indj + 1;
 248               continue;
 249             endif
 250             %# does it looks like a comment line ?
 251             if (regexp (dummy{1}, ['^\s*' char(35)]))
 252               empty_lines = [empty_lines indj];
 253               cmt_lines = strvcat (cmt_lines, horzcat (dummy{:}));
 254               indl = indl + 1; indj = indj + 1;
 255               continue;
 256             endif
 257             %# try to convert to float
 258             the_line = cellfun (@(x) sscanf (x, "%f", locales), dummy, \
 259                                 'UniformOutput', false);
 260             for indk = (1:size (the_line, 2))
 261               if (isempty (the_line{indk}) || any (size (the_line{indk}) > 1))
 262                 %#if indi > 1 && indk > 1, disp('line 117 '); keyboard; %#endif
 263                 if (unquot)
 264                   try
 265                     %# remove quotes and leading space(s)
 266                     x(indj, indk) = regexp (dummy{indk}, '[^'' ].*[^'']', 'match'){1};
 267                   catch
 268                     %# if the previous test fails, try a simpler one
 269                     in = regexp (dummy{indk}, '[^'' ]+', 'match');
 270                     if (!isempty(in))
 271                       x(indj, indk) = in{1};
 272                       %# else
 273                       %#    x(indj, indk) = [];
 274                     endif
 275                   end_try_catch
 276                 else
 277                   %# no conversion possible, store and remove leading space(s)
 278                   x(indj, indk) = regexp (dummy{indk}, '[^ ].*', 'match');
 279                 endif
 280               else
 281                 if (!isempty (regexp (dummy{indk}, '[/:]')))
 282                   %# try to convert to a date
 283                   [timeval, nfields] = strptime( dummy{indk},
 284                                                 [char(37) 'd/' char(37) 'm/' char(37) 'Y ' char(37) 'T']);
 285                   if (nfields > 0) %# at least a few fields are OK
 286                     timestr =  strftime ([char(37) 'H:' char(37) 'M:' char(37) 'S'],
 287                                          timeval);
 288                     %# try to extract the usec field, if any
 289                     idx = regexp (dummy{indk}, timestr, 'end');
 290                     if (!isempty (idx))
 291                       idx = idx + 1;
 292                       if (ispunct (dummy{indk}(idx)))
 293                         idx = idx + 1;
 294                       endif
 295                       timeval.usec = str2num(dummy{indk}(idx:end));
 296                     endif
 297                     x(indj, indk) =  str2num (strftime ([char(37) 's'], timeval)) + ...
 298                         timeval.usec * 1e-6;
 299                   endif
 300                 else
 301                   x(indj, indk) = the_line{indk};
 302                 endif
 303               endif
 304             endfor
 305             indl = indl + 1; indj = indj + 1;
 306           endwhile
 307           if (!isempty(empty_lines))
 308             x(empty_lines, :) = [];
 309           endif
 310           %# detect empty columns
 311           empty_lines = find (0 == sum (cellfun ('size', x, 2)));
 312           if (!isempty(empty_lines))
 313             x(:, empty_lines) = [];
 314           endif
 315           clear UTF8_BOM fid in lines indl the_line content empty_lines
 316           clear timeval timestr nfields idx
 317         endif
 318       end_try_catch
 319     endif
 320
 321     %# fallback, avoiding a recursive call
 322     idx.type = '()';
 323     if (!isa (x, 'char'))
 324       indj = df._cnt(2)+(1:size (x, 2));
 325     else
 326       %# at this point, reading some filename failed
 327       error("dataframe: can't open '%s' for reading data", x);
 328     endif;
 329
 330     if (iscell(x))
 331       if (2 == length (x))
 332         %# use the intermediate value as destination column
 333         [indc, ncol] = df_name2idx (df._name{2}, x{1}, df._cnt(2), "column");
 334         if (ncol != 1)
 335           error (["With two-elements cell, the first should resolve " ...
 336                   "to a single column"]);
 337         endif
 338         try
 339           dummy = cellfun ('class', x{2}(2, :), 'UniformOutput', false);
 340         catch
 341           dummy = cellfun ('class', x{2}(1, :), 'UniformOutput', false);
 342         end_try_catch
 343         df = df_pad (df, 2, [length(dummy) indc], dummy);
 344         x = x{2};
 345         indj =  indc + (1:size(x, 2));  %# redefine target range
 346       else
 347         if (isa (x{1}, 'cell'))
 348           x = x{1}; %# remove one cell level
 349         endif
 350       endif
 351       if (length (df._name{2}) < indj(1) || isempty (df._name{2}(indj)))
 352         [df._name{2}(indj, 1),  df._over{2}(1, indj)] ...
 353             = df_colnames (inputname(indi), indj);
 354         df._name{2} = genvarname (df._name{2});
 355       endif
 356       %# allow overwriting of column names
 357       df._over{2}(1, indj) = true;
 358     else
 359       if (!isempty(indj))
 360         if (1 == length (df._name{2}) && length (df._name{2}) < \
 361             length (indj))
 362           [df._name{2}(indj, 1),  df._over{2}(1, indj)] ...
 363               = df_colnames (char(df._name{2}), indj);
 364         elseif (length (df._name{2}) < indj(1) || isempty (df._name{2}(indj)))
 365           [df._name{2}(indj, 1),  df._over{2}(1, indj)] ...
 366               = df_colnames (inputname(indi), indj);
 367         endif
 368         df._name{2} = genvarname (df._name{2});
 369       endif
 370     endif
 371     if (!isempty (indj))
 372       %# the exact row size will be determined latter
 373       idx.subs = {'', indj};
 374       %# use direct assignement
 375       if (ndims (x) > 2), idx.subs{3} = 1:size (x, 3); endif
 376       %#      df = subsasgn(df, idx, x);        <= call directly lower level
 377       df = df_matassign (df, idx, indj, length(indj), x);
 378       if (!isempty (cmt_lines))
 379         df._cmt = vertcat(df._cmt, cellstr(cmt_lines));
 380         cmt_lines = [];
 381       endif
 382     else
 383       df._cnt(2) = length (df._name{2});
 384     endif
 385   elseif (indi > 1)
 386     error ('Concatenating dataframes: use cat instead');
 387   endif
 388
 389   try
 390     %# loop over next variable argument
 391     x = varargin{1, indi};
 392   catch
 393     %#   disp('line 197 ???');
 394   end_try_catch
 395
 396 endwhile
 397
 398 endfunction
 399
 400 function [x, y] = df_colnames(base, num)
 401   %# small auxiliary function to generate column names. This is required
 402   %# here, as only the constructor can use inputname()
 403   if (any ([index(base, "=")]))
 404     %# takes the left part as base
 405     x = strsplit (base, "=");
 406     x = deblank (x{1});
 407     if (isvarname (x))
 408       y = false;
 409     else
 410       x = 'X'; y = true;
 411     endif
 412   else
 413     %# is base most probably a filename ?
 414     x =  regexp (base, '''[^''].*[^'']''', 'match');
 415     if (isempty (x))
 416       if (isvarname (base))
 417         x = base; y = false;
 418       else
 419         x = 'X'; y = true; %# this is a default value, may be changed
 420       endif
 421     else
 422       x = x{1}; y = true;
 423     endif
 424   endif
 425
 426   if (numel (num) > 1)
 427     x = repmat (x, numel (num), 1);
 428     x = cstrcat (x, strjust (num2str (num(:)), 'left'));
 429     y = repmat (y, 1, numel (num));
 430   endif
 431
 432   x = cellstr (x);
 433
 434 endfunction