octave_packages/m/io/textscan.m

   1 ## Copyright (C) 2010-2012 Ben Abbott <bpabbott@mac.com>
   2 ##
   3 ## This file is part of Octave.
   4 ##
   5 ## Octave is free software; you can redistribute it and/or modify it
   6 ## under the terms of the GNU General Public License as published by
   7 ## the Free Software Foundation; either version 3 of the License, or (at
   8 ## your option) any later version.
   9 ##
  10 ## Octave is distributed in the hope that it will be useful, but
  11 ## WITHOUT ANY WARRANTY; without even the implied warranty of
  12 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 ## General Public License for more details.
  14 ##
  15 ## You should have received a copy of the GNU General Public License
  16 ## along with Octave; see the file COPYING.  If not, see
  17 ## <http://www.gnu.org/licenses/>.
  18
  19 ## -*- texinfo -*-
  20 ## @deftypefn  {Function File} {@var{C} =} textscan (@var{fid}, @var{format})
  21 ## @deftypefnx {Function File} {@var{C} =} textscan (@var{fid}, @var{format}, @var{n})
  22 ## @deftypefnx {Function File} {@var{C} =} textscan (@var{fid}, @var{format}, @var{param}, @var{value}, @dots{})
  23 ## @deftypefnx {Function File} {@var{C} =} textscan (@var{fid}, @var{format}, @var{n}, @var{param}, @var{value}, @dots{})
  24 ## @deftypefnx {Function File} {@var{C} =} textscan (@var{str}, @dots{})
  25 ## @deftypefnx {Function File} {[@var{C}, @var{position}] =} textscan (@var{fid}, @dots{})
  26 ## Read data from a text file or string.
  27 ##
  28 ## The file associated with @var{fid} is read and parsed according to
  29 ## @var{format}.  The function behaves like @code{strread} except it works by
  30 ## parsing a file instead of a string.  See the documentation of
  31 ## @code{strread} for details.
  32 ##
  33 ## In addition to the options supported by
  34 ## @code{strread}, this function supports a few more:
  35 ##
  36 ## @itemize
  37 ## @item "collectoutput":
  38 ## A value of 1 or true instructs textscan to concatenate consecutive columns
  39 ## of the same class in the output cell array.  A value of 0 or false (default)
  40 ## leaves output in distinct columns.
  41 ##
  42 ## @item "endofline":
  43 ## Specify "\r", "\n" or "\r\n" (for CR, LF, or CRLF).  If no value is given,
  44 ## it will be inferred from the file.  If set to "" (empty string) EOLs are
  45 ## ignored as delimiters and added to whitespace.
  46 ##
  47 ## @item "headerlines":
  48 ## The first @var{value} number of lines of @var{fid} are skipped.
  49 ##
  50 ## @item "returnonerror":
  51 ## If set to numerical 1 or true (default), return normally when read errors
  52 ## have been encountered.  If set to 0 or false, return an error and no data.
  53 ## @end itemize
  54 ##
  55 ## The optional input @var{n} specifes the number of times to use
  56 ## @var{format} when parsing, i.e., the format repeat count.
  57 ##
  58 ## The output @var{C} is a cell array whose length is given by the number
  59 ## of format specifiers.
  60 ##
  61 ## The second output, @var{position}, provides the position, in characters,
  62 ## from the beginning of the file.
  63 ##
  64 ## @seealso{dlmread, fscanf, load, strread, textread}
  65 ## @end deftypefn
  66
  67 function [C, position] = textscan (fid, format = "%f", varargin)
  68
  69   ## Check input
  70   if (nargin < 1)
  71     print_usage ();
  72   endif
  73
  74   if (isempty (format))
  75     format = "%f";
  76   endif
  77
  78   if (! (isa (fid, "double") && fid > 0) && ! ischar (fid))
  79     error ("textscan: first argument must be a file id or character string");
  80   endif
  81
  82   if (! ischar (format))
  83     error ("textscan: FORMAT must be a string");
  84   endif
  85
  86   args = varargin;
  87   if (nargin > 2 && isnumeric (args{1}))
  88     nlines = args{1};
  89   else
  90     nlines = Inf;
  91   endif
  92
  93   if (! any (strcmpi (args, "emptyvalue")))
  94     ## Matlab returns NaNs for missing values
  95     args(end+1:end+2) = {'emptyvalue', NaN};
  96   endif
  97
  98   ## Check default parameter values that differ for strread & textread
  99
 100   ipos = find (strcmpi (args, "whitespace"));
 101   if (isempty (ipos))
 102     ## Matlab default whitespace = " \b\t"
 103     args(end+1:end+2) = {'whitespace', " \b\t"};
 104     whitespace = " \b\t";
 105   else
 106     ## Check if there's at least one string format specifier
 107     fmt = strrep (format, "%", " %");
 108     fmt = regexp (fmt, '[^ ]+', 'match');
 109     fmt = strtrim (fmt(strmatch ("%", fmt)))
 110     has_str_fmt = all (cellfun ("isempty", strfind (strtrim (fmt(strmatch ("%", fmt))), 's')));
 111     ## If there is a format, AND whitespace value = empty,
 112     ## don't add a space (char(32)) to whitespace
 113     if (! (isempty (args{ipos+1}) &&  has_str_fmt))
 114       args{ipos+1} = unique ([" ", whitespace]);
 115     endif
 116   endif
 117
 118   if (! any (strcmpi (args, "delimiter")))
 119     ## Matlab says default delimiter = whitespace.
 120     ## strread() will pick this up further
 121     args(end+1:end+2) = {'delimiter', ""};
 122   endif
 123
 124   collop = false;
 125   ipos = find (strcmpi (args, "collectoutput"));
 126   if (! isempty (ipos))
 127     ## Search & concatenate consecutive columns of same class requested
 128     if (isscalar (args{ipos+1})
 129         && (islogical (args{ipos+1}) || isnumeric (args{ipos+1})))
 130       collop = args{ipos+1};
 131     else
 132       warning ("textscan: illegal value for CollectOutput parameter - ignored");
 133     endif
 134     ## Remove argument before call to strread() below
 135     args(ipos:ipos+1) = [];
 136   endif
 137
 138   if (any (strcmpi (args, "returnonerror")))
 139     ## Because of the way strread() reads data (columnwise) this parameter
 140     ## can't be neatly implemented.  strread() will pick it up anyway
 141     warning ('textscan: ReturnOnError is not fully implemented');
 142   else
 143     ## Set default value (=true)
 144     args(end+1:end+2) = {"returnonerror", 1};
 145   endif
 146
 147   if (ischar (fid))
 148     ## Read from a text string
 149     if (nargout == 2)
 150       error ("textscan: cannot provide position information for character input");
 151     endif
 152     str = fid;
 153   else
 154     ## Skip header lines if requested
 155     headerlines = find (strcmpi (args, "headerlines"), 1);
 156     ## Beware of zero valued headerline, fskipl would skip to EOF
 157     if (! isempty (headerlines) && (args{headerlines + 1} > 0))
 158       fskipl (fid, varargin{headerlines + 1});
 159       args(headerlines:headerlines+1) = [];
 160     endif
 161     if (isfinite (nlines) && (nlines >= 0))
 162       str = tmp_str = "";
 163       n = 0;
 164       ## FIXME: Can this be done without slow loop?
 165       while (ischar (tmp_str) && n++ < nlines)
 166         tmp_str = fgets (fid);
 167         if (ischar (tmp_str))
 168           str = strcat (str, tmp_str);
 169         endif
 170       endwhile
 171     else
 172       str = fread (fid, "char=>char").';
 173     endif
 174   endif
 175
 176   ## Check for empty result
 177   if (isempty (str))
 178     warning ("textscan: no data read");
 179     C = [];
 180     return;
 181   endif
 182
 183   ## Check value of 'endofline'.  String or file doesn't seem to matter
 184   endofline = find (strcmpi (args, "endofline"), 1);
 185   if (! isempty (endofline))
 186     if (ischar (args{endofline + 1}))
 187       eol_char = args{endofline + 1};
 188       if (isempty (strmatch (eol_char, {"", "\n", "\r", "\r\n"}, 'exact')))
 189         error ("textscan: illegal EndOfLine character value specified");
 190       endif
 191     else
 192       error ("textscan: character value required for EndOfLine");
 193     endif
 194   else
 195     ## Determine EOL from file.  Search for EOL candidates in first 3000 chars
 196     eol_srch_len = min (length (str), 3000);
 197     ## First try DOS (CRLF)
 198     if (! isempty (findstr ("\r\n", str(1 : eol_srch_len))))
 199       eol_char = "\r\n";
 200     ## Perhaps old Macintosh? (CR)
 201     elseif (! isempty (findstr ("\r", str(1 : eol_srch_len))))
 202       eol_char = "\r";
 203     ## Otherwise, use plain UNIX (LF)
 204     else
 205       eol_char = "\n";
 206     endif
 207     ## Set up the default endofline param value
 208     args(end+1:end+2) = {'endofline', eol_char};
 209   endif
 210
 211   ## Determine the number of data fields
 212   num_fields = numel (strfind (format, "%")) - numel (strfind (format, "%*"));
 213
 214   ## Strip trailing EOL to avoid returning stray missing values (f. strread)
 215   if (strcmp (str(end-length (eol_char) + 1 : end), eol_char));
 216     str(end-length (eol_char) + 1 : end) = "";
 217   endif
 218
 219   ## Call strread to make it do the real work
 220   C = cell (1, num_fields);
 221   [C{:}] = strread (str, format, args{:});
 222
 223   ## If requested, collect output columns of same class
 224   if (collop)
 225     C = colloutp (C);
 226   endif
 227
 228   if (nargout == 2)
 229     position = ftell (fid);
 230   endif
 231
 232 endfunction
 233
 234
 235 ## Collect consecutive columns of same class into one cell column
 236 function C = colloutp (C)
 237
 238   ## Start at rightmost column and work backwards to avoid ptr mixup
 239   ii = numel (C);
 240   while ii > 1
 241     clss1 = class (C{ii});
 242     jj = ii;
 243     while  (jj > 1 && strcmp (clss1, class (C{jj - 1})))
 244       ## Column to the left is still same class; check next column to the left
 245       --jj;
 246     endwhile
 247     if (jj < ii)
 248       ## Concatenate columns into current column
 249       C{jj} = [C{jj : ii}];
 250       ## Wipe concatenated columns to the right, resume search to the left
 251       C(jj+1 : ii) = [];
 252       ii = jj - 1;
 253     else
 254       ## No similar class in column to the left, search from there
 255       --ii;
 256     endif
 257   endwhile
 258
 259 endfunction
 260
 261 %!test
 262 %! str = "1,  2,  3,  4\n 5,  ,  ,  8\n 9, 10, 11, 12";
 263 %! fmtstr = "%f %d %f %s";
 264 %! c = textscan (str, fmtstr, 2, "delimiter", ",", "emptyvalue", -Inf);
 265 %! assert (isequal (c{1}, [1;5]));
 266 %! assert (length (c{1}), 2);
 267 %! assert (iscellstr (c{4}));
 268 %! assert (isequal (c{3}, [3; -Inf]));
 269
 270 %!test
 271 %! b = [10:10:100];
 272 %! b = [b; 8*b/5];
 273 %! str = sprintf ("%g miles/hr = %g kilometers/hr\n", b);
 274 %! fmt = "%f miles/hr = %f kilometers/hr";
 275 %! c = textscan (str, fmt);
 276 %! assert (b(1,:)', c{1}, 1e-5);
 277 %! assert (b(2,:)', c{2}, 1e-5);
 278
 279 #%!test
 280 #%! str = "13, 72, NA, str1, 25\r\n// Middle line\r\n36, na, 05, str3, 6";
 281 #%! a = textscan(str, '%d %n %f %s %n', 'delimiter', ',','treatAsEmpty', {'NA', 'na'},'commentStyle', '//');
 282 #%! assert (a{1}, int32([13; 36]));
 283 #%! assert (a{2}, [72; NaN]);
 284 #%! assert (a{3}, [NaN; 5]);
 285 #%! assert (a{4}, {"str1"; "str3"});
 286 #%! assert (a{5}, [25; 6]);
 287
 288 %!test
 289 %! str = "Km:10 = hhhBjjj miles16hour\r\n";
 290 %! str = [str "Km:15 = hhhJjjj miles241hour\r\n"];
 291 %! str = [str "Km:2 = hhhRjjj miles3hour\r\n"];
 292 %! str = [str "Km:25 = hhhZ\r\n"];
 293 %! fmt = "Km:%d = hhh%1sjjj miles%dhour";
 294 %! a = textscan (str, fmt, 'delimiter', ' ');
 295 %! assert (a{1}', int32([10 15 2 25]));
 296 %! assert (a{2}', {'B' 'J' 'R' 'Z'});
 297 %! assert (a{3}', int32([16 241 3 0]));
 298
 299 %% Test with default endofline parameter
 300 %!test
 301 %! c = textscan ("L1\nL2", "%s");
 302 %! assert (c{:}, {"L1"; "L2"});
 303
 304 %% Test with endofline parameter set to '' (empty) - newline should be in word
 305 %!test
 306 %! c = textscan ("L1\nL2", "%s", 'endofline', '');
 307 %! assert (int8(c{:}{:}), int8([ 76,  49,  10,  76,  50 ]));
 308
 309 %!test
 310 %! # No delimiters at all besides EOL.  Skip fields, even empty fields
 311 %! str = "Text1Text2Text\nTextText4Text\nText57Text";
 312 %! c = textscan (str, "Text%*dText%dText");
 313 %! assert (c{1}, int32 ([2; 4; 0]));
 314
 315 %!test
 316 %% CollectOutput test
 317 %! b = [10:10:100];
 318 %! b = [b; 8*b/5; 8*b*1000/5];
 319 %! str = sprintf ("%g miles/hr = %g (%g) kilometers (meters)/hr\n", b);
 320 %! fmt = "%f miles%s %s %f (%f) kilometers %*s";
 321 %! c = textscan (str, fmt, 'collectoutput', 1);
 322 %! assert (size(c{3}), [10, 2]);
 323 %! assert (size(c{2}), [10, 2]);
 324
 325 %% Test input validation
 326 %!error textscan ()
 327 %!error textscan (single (4))
 328 %!error textscan ({4})
 329 %!error <must be a string> textscan ("Hello World", 2)
 330 %!error <cannot provide position information> [C, pos] = textscan ("Hello World")
 331 %!error <character value required> textscan ("Hello World", '%s', 'EndOfLine', 3)
 332