octave_packages/m/io/strread.m

   1 ## Copyright (C) 2009-2012 Eric Chassande-Mottin, CNRS (France)
   2 ## Copyright (C) 2012 Philip Nienhuis
   3 ##
   4 ## This file is part of Octave.
   5 ##
   6 ## Octave is free software; you can redistribute it and/or modify it
   7 ## under the terms of the GNU General Public License as published by
   8 ## the Free Software Foundation; either version 3 of the License, or (at
   9 ## your option) any later version.
  10 ##
  11 ## Octave is distributed in the hope that it will be useful, but
  12 ## WITHOUT ANY WARRANTY; without even the implied warranty of
  13 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 ## General Public License for more details.
  15 ##
  16 ## You should have received a copy of the GNU General Public License
  17 ## along with Octave; see the file COPYING.  If not, see
  18 ## <http://www.gnu.org/licenses/>.
  19
  20 ## -*- texinfo -*-
  21 ## @deftypefn  {Function File} {[@var{a}, @dots{}] =} strread (@var{str})
  22 ## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format})
  23 ## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format}, @var{format_repeat})
  24 ## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format}, @var{prop1}, @var{value1}, @dots{})
  25 ## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format}, @var{format_repeat}, @var{prop1}, @var{value1}, @dots{})
  26 ## Read data from a string.
  27 ##
  28 ## The string @var{str} is split into words that are repeatedly matched to the
  29 ## specifiers in @var{format}.  The first word is matched to the first
  30 ## specifier, the second to the second specifier and so forth.  If there are
  31 ## more words than specifiers, the process is repeated until all words have
  32 ## been processed.
  33 ##
  34 ## The string @var{format} describes how the words in @var{str} should be
  35 ## parsed.
  36 ## It may contain any combination of the following specifiers:
  37 ##
  38 ## @table @code
  39 ## @item %s
  40 ## The word is parsed as a string.
  41 ##
  42 ## @itemx %f
  43 ## @itemx %n
  44 ## The word is parsed as a number and converted to double.
  45 ##
  46 ## @item  %d
  47 ## @itemx %u
  48 ## The word is parsed as a number and converted to int32.
  49 ##
  50 ## @item %*', '%*f', '%*s
  51 ## The word is skipped.
  52 ##
  53 ## For %s and %d, %f, %n, %u and the associated %*s @dots{} specifiers an
  54 ## optional width can be specified as %Ns, etc. where N is an integer > 1.
  55 ## For %f, format specifiers like %N.Mf are allowed.
  56 ##
  57 ## @item literals
  58 ## In addition the format may contain literal character strings; these will be
  59 ## skipped during reading.
  60 ## @end table
  61 ##
  62 ## Parsed word corresponding to the first specifier are returned in the first
  63 ## output argument and likewise for the rest of the specifiers.
  64 ##
  65 ## By default, @var{format} is @t{"%f"}, meaning that numbers are read from
  66 ## @var{str}.  This will do if @var{str} contains only numeric fields.
  67 ##
  68 ## For example, the string
  69 ##
  70 ## @example
  71 ## @group
  72 ## @var{str} = "\
  73 ## Bunny Bugs   5.5\n\
  74 ## Duck Daffy  -7.5e-5\n\
  75 ## Penguin Tux   6"
  76 ## @end group
  77 ## @end example
  78 ##
  79 ## @noindent
  80 ## can be read using
  81 ##
  82 ## @example
  83 ## [@var{a}, @var{b}, @var{c}] = strread (@var{str}, "%s %s %f");
  84 ## @end example
  85 ##
  86 ## Optional numeric argument @var{format_repeat} can be used for
  87 ## limiting the number of items read:
  88 ##
  89 ## @table @asis
  90 ## @item -1
  91 ## (default) read all of the string until the end.
  92 ##
  93 ## @item N
  94 ## Read N times @var{nargout} items.  0 (zero) is an acceptable
  95 ## value for @var{format_repeat}.
  96 ## @end table
  97 ##
  98 ## The behavior of @code{strread} can be changed via property-value
  99 ## pairs.  The following properties are recognized:
 100 ##
 101 ## @table @asis
 102 ## @item "commentstyle"
 103 ## Parts of @var{str} are considered comments and will be skipped.
 104 ## @var{value} is the comment style and can be any of the following.
 105 ## @itemize
 106 ## @item "shell"
 107 ## Everything from @code{#} characters to the nearest end-of-line is skipped.
 108 ##
 109 ## @item "c"
 110 ## Everything between @code{/*} and @code{*/} is skipped.
 111 ##
 112 ## @item "c++"
 113 ## Everything from @code{//} characters to the nearest end-of-line is skipped.
 114 ##
 115 ## @item "matlab"
 116 ## Everything from @code{%} characters to the nearest end-of-line is skipped.
 117 ##
 118 ## @item user-supplied.  Two options:
 119 ## (1) One string, or 1x1 cell string: Skip everything to the right of it;
 120 ## (2) 2x1 cell string array: Everything between the left and right strings
 121 ## is skipped.
 122 ## @end itemize
 123 ##
 124 ## @item "delimiter"
 125 ## Any character in @var{value} will be used to split @var{str} into words
 126 ## (default value = any whitespace).
 127 ##
 128 ## @item "emptyvalue":
 129 ## Value to return for empty numeric values in non-whitespace delimited data.
 130 ## The default is NaN@.  When the data type does not support NaN
 131 ## (int32 for example), then default is zero.
 132 ##
 133 ## @item "multipledelimsasone"
 134 ## Treat a series of consecutive delimiters, without whitespace in between,
 135 ## as a single delimiter.  Consecutive delimiter series need not be vertically
 136 ## "aligned".
 137 ##
 138 ## @item "treatasempty"
 139 ## Treat single occurrences (surrounded by delimiters or whitespace) of the
 140 ## string(s) in @var{value} as missing values.
 141 ##
 142 ## @item "returnonerror"
 143 ## If @var{value} true (1, default), ignore read errors and return normally.
 144 ## If false (0), return an error.
 145 ##
 146 ## @item "whitespace"
 147 ## Any character in @var{value} will be interpreted as whitespace and
 148 ## trimmed; the string defining whitespace must be enclosed in double
 149 ## quotes for proper processing of special characters like \t.
 150 ## The default value for whitespace = " \b\r\n\t" (note the space).
 151 ## Unless whitespace is set to '' (empty) AND at least one "%s" format
 152 ## conversion specifier is supplied, a space is always part of whitespace.
 153 ##
 154 ## @end table
 155 ##
 156 ## @seealso{textscan, textread, load, dlmread, fscanf}
 157 ## @end deftypefn
 158
 159 function varargout = strread (str, format = "%f", varargin)
 160
 161   ## Check input
 162   if (nargin < 1)
 163     print_usage ();
 164   endif
 165
 166   if (isempty (format))
 167     format = "%f";
 168   endif
 169
 170   if (! ischar (str) || ! ischar (format))
 171     error ("strread: STR and FORMAT arguments must be strings");
 172   endif
 173
 174   ## Parse format string to compare number of conversion fields and nargout
 175   nfields = length (strfind (format, "%")) - length (strfind (format, "%*"));
 176   ## If str only has numeric fields, a (default) format ("%f") will do.
 177   ## Otherwise:
 178   if ((max (nargout, 1) != nfields) && ! strcmp (format, "%f"))
 179     error ("strread: the number of output variables must match that specified by FORMAT");
 180   endif
 181
 182   ## Check for format string repeat count
 183   format_repeat_count = -1;
 184   if (nargin > 2 && isnumeric (varargin{1}))
 185     if (varargin{1} >= 0)
 186       format_repeat_count = varargin{1};
 187     endif
 188     if (nargin > 3)
 189       varargin = varargin(2:end);
 190     else
 191       varargin = {};
 192     endif
 193   endif
 194
 195   ## Parse options.  First initialize defaults
 196   comment_flag = false;
 197   delimiter_str = "";
 198   empty_str = "";
 199   eol_char = "";
 200   err_action = 0;
 201   mult_dlms_s1 = false;
 202   numeric_fill_value = NaN;
 203   white_spaces = " \b\r\n\t";
 204   for n = 1:2:length (varargin)
 205     switch (lower (varargin{n}))
 206       case "bufsize"
 207         ## We could synthesize this, but that just seems weird...
 208         warning ('strread: property "bufsize" is not implemented');
 209       case "commentstyle"
 210         comment_flag = true;
 211         switch (lower (varargin{n+1}))
 212           case "c"
 213             [comment_start, comment_end] = deal ("/*", "*/");
 214           case "c++"
 215             [comment_start, comment_end] = deal ("//", "eol_char");
 216           case "shell"
 217             [comment_start, comment_end] = deal ("#" , "eol_char");
 218           case "matlab"
 219             [comment_start, comment_end] = deal ("%" , "eol_char");
 220           otherwise
 221             if (ischar (varargin{n+1}) ||
 222                (numel (varargin{n+1}) == 1 && iscellstr (varargin{n+1})))
 223               [comment_start, comment_end] = deal (char (varargin{n+1}), "eol_char");
 224             elseif (iscellstr (varargin{n+1}) && numel (varargin{n+1}) == 2)
 225               [comment_start, comment_end] = deal (varargin{n+1}{:});
 226             else
 227               ## FIXME - a user may have numeric values specified: {'//', 7}
 228               ##         this will lead to an error in the warning message
 229               error ("strread: unknown or unrecognized comment style '%s'",
 230                       varargin{n+1});
 231             endif
 232         endswitch
 233       case "delimiter"
 234         delimiter_str = varargin{n+1};
 235         if (strcmp (typeinfo (delimiter_str), "sq_string"))
 236           delimiter_str = do_string_escapes (delimiter_str);
 237         endif
 238       case "emptyvalue"
 239         numeric_fill_value = varargin{n+1};
 240       case "expchars"
 241         warning ('strread: property "expchars" is not implemented');
 242       case "whitespace"
 243         white_spaces = varargin{n+1};
 244         if (strcmp (typeinfo (white_spaces), "sq_string"))
 245           white_spaces = do_string_escapes (white_spaces);
 246         endif
 247       ## The following parameters are specific to textscan and textread
 248       case "endofline"
 249         eol_char = varargin{n+1};
 250         if (strcmp (typeinfo (eol_char), "sq_string"))
 251           eol_char = do_string_escapes (eol_char);
 252         endif
 253       case "returnonerror"
 254         err_action = varargin{n+1};
 255       case "multipledelimsasone"
 256         mult_dlms_s1 = varargin{n+1};
 257       case "treatasempty"
 258         if (iscellstr (varargin{n+1}))
 259           empty_str = varargin{n+1};
 260         elseif (ischar (varargin{n+1}))
 261           empty_str = varargin(n+1);
 262         else
 263           error ('strread: "treatasempty" value must be string or cellstr');
 264         endif
 265       otherwise
 266         warning ('strread: unknown property "%s"', varargin{n});
 267     endswitch
 268   endfor
 269
 270   ## First parse of FORMAT
 271   if (strcmpi (strtrim (format), "%f"))
 272     ## Default format specified.  Expand it (to desired nargout)
 273     fmt_words = cell (nargout, 1);
 274     fmt_words (1:nargout) = format;
 275   else
 276     ## Determine the number of words per line as a first guess.  Forms
 277     ## like %f<literal>) (w/o delimiter in between) are fixed further on
 278     format = strrep (format, "%", " %");
 279     fmt_words = regexp (format, '[^ ]+', 'match');
 280     ## Format conversion specifiers following literals w/o space/delim
 281     ## in between are separate now.  Separate those w trailing literals
 282     idy2 = find (! cellfun ("isempty", strfind (fmt_words, "%")));
 283     a = strfind (fmt_words(idy2), "%");
 284     b = regexp (fmt_words(idy2), '[nfdus]', 'end');
 285     for jj = 1:numel (a)
 286       ii = numel (a) - jj + 1;
 287       if (! (length (fmt_words{idy2(ii)}) == b{ii}(1)))
 288         ## Fix format_words
 289         fmt_words(idy2(ii)+1 : end+1) = fmt_words(idy2(ii) : end);
 290         fmt_words{idy2(ii)} = fmt_words{idy2(ii)}(a{ii} : b{ii}(1));
 291         fmt_words{idy2(ii)+1} = fmt_words{idy2(ii)+1}(b{ii}+1:end);
 292       endif
 293     endfor
 294   endif
 295   num_words_per_line = numel (fmt_words);
 296
 297   ## Special handling for CRLF EOL character in str
 298   if (! isempty (eol_char) && strcmp (eol_char, "\r\n"))
 299     ## Strip CR from CRLF sequences
 300     str = strrep (str, "\r\n", "\n");
 301     ## CR serves no further purpose in function
 302     eol_char = "\n";
 303   endif
 304
 305   ## Remove comments in str
 306   if (comment_flag)
 307     ## Expand 'eol_char' here, after option processing which may have set value
 308     comment_end = regexprep (comment_end, 'eol_char', eol_char);
 309     cstart = strfind (str, comment_start);
 310     cstop  = strfind (str, comment_end);
 311     ## Treat end of string as additional comment stop
 312     if (isempty (cstop) || cstop(end) != length (str))
 313       cstop(end+1) = length (str);
 314     endif
 315     if (! isempty (cstart))
 316       ## Ignore nested openers.
 317       [idx, cidx] = unique (lookup (cstop, cstart), "first");
 318       if (idx(end) == length (cstop))
 319         cidx(end) = []; # Drop the last one if orphaned.
 320       endif
 321       cstart = cstart(cidx);
 322     endif
 323     if (! isempty (cstop))
 324       ## Ignore nested closers.
 325       [idx, cidx] = unique (lookup (cstart, cstop), "first");
 326       if (idx(1) == 0)
 327         cidx(1) = []; # Drop the first one if orphaned.
 328       endif
 329       cstop = cstop(cidx);
 330     endif
 331     len = length (str);
 332     c2len = length (comment_end);
 333     str = cellslices (str, [1, cstop + c2len], [cstart - 1, len]);
 334     str = [str{:}];
 335   endif
 336
 337   if (! isempty (white_spaces))
 338     ## For numeric fields, whitespace is always a delimiter, but not for text fields
 339     if (isempty (strfind (format, "%s")))
 340       ## Add whitespace to delimiter set
 341       delimiter_str = unique ([white_spaces delimiter_str]);
 342     else
 343       ## Remove any delimiter chars from white_spaces list
 344       white_spaces = setdiff (white_spaces, delimiter_str);
 345     endif
 346   endif
 347   if (isempty (delimiter_str))
 348     delimiter_str = " ";
 349   endif
 350   if (! isempty (eol_char))
 351     ## Add eol_char to delimiter collection
 352     delimiter_str = unique ([delimiter_str eol_char]);
 353     ## .. and remove it from whitespace collection
 354     white_spaces = strrep (white_spaces, eol_char, '');
 355   endif
 356
 357   pad_out = 0;
 358   ## Trim whitespace if needed
 359   if (! isempty (white_spaces))
 360     ## Check if trailing "\n" might signal padding output arrays to equal size
 361     ## before it is trimmed away below
 362     if ((str(end) == 10) && (nargout > 1))
 363       pad_out = 1;
 364     endif
 365     ## Condense all repeated whitespace into one single space
 366     ## FIXME: this will also fold repeated whitespace in a char field
 367     rxp_wsp = sprintf ("[%s]+", white_spaces);
 368     str = regexprep (str, rxp_wsp, ' ');
 369     ## Remove possible leading space at string
 370     if (str(1) == 32)
 371        str = str(2:end);
 372     endif
 373     ## Check for single delimiter followed/preceded by whitespace
 374     ## FIXME: Double strrep on str is enormously expensive of CPU time.
 375     ## Can this be eliminated
 376     if (! isempty (delimiter_str))
 377       dlmstr = setdiff (delimiter_str, " ");
 378       rxp_dlmwsp = sprintf ("( [%s]|[%s] )", dlmstr, dlmstr);
 379       str = regexprep (str, rxp_dlmwsp, delimiter_str(1));
 380     endif
 381     ## FIXME: Double strrep on str is enormously expensive of CPU time.
 382     ## Can this be eliminated
 383     ## Wipe leading and trailing whitespace on each line (it may be delimiter too)
 384     if (! isempty (eol_char))
 385       str = strrep (str, [eol_char " "], eol_char);
 386       str = strrep (str, [" " eol_char], eol_char);
 387     endif
 388   endif
 389
 390   ## Split 'str' into words
 391   words = split_by (str, delimiter_str, mult_dlms_s1, eol_char);
 392   if (! isempty (white_spaces))
 393     ## Trim leading and trailing white_spaces
 394     ## FIXME: Is this correct?  strtrim clears what matches isspace(), not
 395     ## necessarily what is in white_spaces.
 396     words = strtrim (words);
 397   endif
 398   num_words = numel (words);
 399   ## First guess at number of lines in file (ignoring leading/trailing literals)
 400   num_lines = ceil (num_words / num_words_per_line);
 401
 402   ## Replace TreatAsEmpty char sequences by empty strings
 403   if (! isempty (empty_str))
 404     for ii = 1:numel (empty_str)
 405       idz = strmatch (empty_str{ii}, words, "exact");
 406       words(idz) = {""};
 407     endfor
 408   endif
 409
 410   ## fmt_words has been split properly now, but words{} has only been split on
 411   ## delimiter positions.
 412   ## As numeric fields can also be separated by whitespace, more splits may be
 413   ## needed.
 414   ## We also don't know the number of lines (as EndOfLine may have been set to
 415   ## "" (empty) by the caller).
 416   ##
 417   ## We also may have to cope with 3 cases as far as literals go:
 418   ## A: Trailing literals (%f<literal>) w/o delimiter in between.
 419   ## B: Leading literals (<literal>%f) w/o delimiter in between.
 420   ## C. Skipping leftover parts of specified skip fields (%*N )
 421   ## Some words columns may have to be split further to fix these.
 422
 423   ## Find indices and pointers to possible literals in fmt_words
 424   idf = cellfun ("isempty", strfind (fmt_words, "%"));
 425   ## Find indices and pointers to conversion specifiers with fixed width
 426   idg = ! cellfun ("isempty", regexp (fmt_words, '%\*?\d'));
 427   idy = find (idf | idg);
 428   ## Find indices to numeric conversion specifiers
 429   idn = ! cellfun ("isempty", regexp (fmt_words, "%[dnfu]"));
 430
 431   ## If needed, split up columns in three steps:
 432   if (! isempty (idy))
 433     ## Try-catch because complexity of strings to read can be infinite
 434     try
 435
 436       ## 1. Assess "period" in the split-up words array ( < num_words_per_line).
 437       ## Could be done using EndOfLine but that prohibits EndOfLine = "" option.
 438       ## Alternative below goes by simply parsing a first grab of words
 439       ## and counting words until the fmt_words array is exhausted:
 440       iwrd = 1; iwrdp = 0; iwrdl = length (words{iwrd});
 441       for ii = 1:numel (fmt_words)
 442
 443         nxt_wrd = 0;
 444
 445         if (idf(ii))
 446           ## Literal expected
 447           if (isempty (strfind (fmt_words{ii}, words(iwrd))))
 448             ## Not found in current word; supposed to be in next word
 449             nxt_wrd = 1;
 450           else
 451             ## Found it in current word.  Subtract literal length
 452             iwrdp += length (fmt_words{ii});
 453             if (iwrdp > iwrdl)
 454               ## Parse error.  Literal extends beyond delimiter (word boundary)
 455               warning ("strread: literal '%s' (fmt spec # %d) does not match data", ...
 456                 fmt_words{ii}, ii);
 457               ## Word assumed to be completely "used up". Next word
 458               nxt_wrd = 1;
 459             elseif (iwrdp == iwrdl)
 460               ## Word completely "used up". Next word
 461               nxt_wrd = 1;
 462             endif
 463           endif
 464
 465         elseif (idg(ii))
 466           ## Fixed width specifier (%N or %*N): read just a part of word
 467           iwrdp += floor ...
 468            (str2double (fmt_words{ii}(regexp(fmt_words{ii}, '\d') : end-1)));
 469           if (iwrdp > iwrdl)
 470             ## Match error. Field extends beyond word boundary.
 471             warning  ...
 472             ("strread: field width '%s' (fmt spec # %d) extends beyond actual word limit", ...
 473                fmt_words{ii}, ii);
 474             ## Assume word to be completely "used up".  Next word
 475             nxt_wrd = 1;
 476           elseif (iwrdp == iwrdl)
 477             ## Word completely "used up".  Next word
 478             nxt_wrd = 1;
 479           endif
 480
 481         else
 482           ## A simple format conv. specifier. Either (1) uses rest of word, or
 483           ## (2) is squeezed between current iwrdp and next literal, or (3) uses
 484           ## next word. (3) is already taken care of.  So just check (1) & (2)
 485           if (ii < numel (fmt_words) && idf(ii+1))
 486             ## Next fmt_word is a literal...
 487             if (! index (words{iwrd}(iwrdp+1:end), fmt_words{ii+1}))
 488               ## ...but not found in current word => field uses rest of word
 489               nxt_wrd = 1;
 490             else
 491               ## ..or it IS found.  Add inferred width of current conversion field
 492               iwrdp += index (words{iwrd}(iwrdp+1:end), fmt_words{ii+1}) - 1;
 493             endif
 494           elseif (iwrdp < iwrdl)
 495             ## No bordering literal to the right => field occupies (rest of) word
 496             nxt_wrd = 1;
 497           endif
 498
 499         endif
 500
 501         if (nxt_wrd)
 502           ++iwrd; iwrdp = 0;
 503           if (ii < numel (fmt_words))
 504             iwrdl = length (words{iwrd});
 505           endif
 506         endif
 507
 508       endfor
 509       ## Done
 510       words_period = max (iwrd - 1, 1);
 511       num_lines = ceil (num_words / words_period);
 512
 513       ## 2. Pad words array so that it can be reshaped
 514       tmp_lines = ceil (num_words / words_period);
 515       num_words_padded = tmp_lines * words_period - num_words;
 516       if (num_words_padded)
 517         words = [words'; cell(num_words_padded, 1)];
 518       endif
 519       words = reshape (words, words_period, tmp_lines);
 520
 521       ## 3. Do the column splitting on rectangular words array
 522       icol = 1; ii = 1;    # icol = current column, ii = current fmt_word
 523       while (ii <= num_words_per_line)
 524
 525         ## Check if fmt_words(ii) contains a literal or fixed-width
 526         if ((idf(ii) || idg(ii)) && (rows(words) < num_words_per_line))
 527           if (idf(ii))
 528             s = strfind (words(icol, 1), fmt_words{ii});
 529             if (isempty (s{:}))
 530               error ("strread: Literal '%s' not found in column %d", fmt_words{ii}, icol);
 531             endif
 532             s = s{:}(1);
 533             e = s(1) + length (fmt_words{ii}) - 1;
 534           endif
 535           if (! strcmp (fmt_words{ii}, words{icol, 1}))
 536             ## Column doesn't exactly match literal => split needed.  Insert a column
 537             words(icol+1:end+1, :) = words(icol:end, :);
 538             ## Watch out for empty cells
 539             jptr = find (! cellfun ("isempty", words(icol, :)));
 540
 541             ## Distinguish leading or trailing literals
 542             if (! idg(ii) && ! isempty (s) && s(1) == 1)
 543               ## Leading literal.  Assign literal to icol, paste rest in icol + 1
 544               ## Apply only to those cells that do have something beyond literal
 545               jptr = find (cellfun("length", words(icol+1, jptr), ...
 546                             "UniformOutput", false) > e(1));
 547               words(icol+1, :) = {""};
 548               words(icol+1, jptr) = cellfun ...
 549                 (@(x) substr(x, e(1)+1, length(x)-e(1)), words(icol, jptr), ...
 550                 "UniformOutput", false);
 551               words(icol, jptr) = fmt_words{ii};
 552
 553             else
 554               if (! idg(ii) && ! isempty (strfind (fmt_words{ii-1}, "%s")))
 555                 ## Trailing literal.  If preceding format == '%s' this is an error
 556                 warning ("Ambiguous '%s' specifier next to literal in column %d", icol);
 557               elseif (idg(ii))
 558                 ## Current field = fixed width. Strip into icol, rest in icol+1
 559                 wdth = floor (str2double (fmt_words{ii}(regexp(fmt_words{ii}, ...
 560                               '\d') : end-1)));
 561                 words(icol+1, jptr) = cellfun (@(x) x(wdth+1:end),
 562                      words(icol,jptr), "UniformOutput", false);
 563                 words(icol, jptr) = strtrunc (words(icol, jptr), wdth);
 564               else
 565                 ## FIXME: this assumes char(254)/char(255) won't occur in input!
 566                 clear wrds;
 567                 wrds(1:2:2*numel (words(icol, jptr))) = ...
 568                      strrep (words(icol, jptr), fmt_words{ii}, ...
 569                      [char(255) char(254)]);
 570                 wrds(2:2:2*numel (words(icol, jptr))-1) = char(255);
 571                 wrds = strsplit ([wrds{:}], char(255));
 572                 words(icol, jptr) = ...
 573                   wrds(find (cellfun ("isempty", strfind (wrds, char(254)))));
 574                 wrds(find (cellfun ("isempty", strfind (wrds, char(254))))) ...
 575                    = char(255);
 576                 words(icol+1, jptr) = strsplit (strrep ([wrds{2:end}], ...
 577                    char(254), fmt_words{ii}), char(255));
 578                 ## Former trailing literal may now be leading for next specifier
 579                 --ii;
 580               endif
 581             endif
 582           endif
 583
 584         else
 585           ## Conv. specifier.  Peek if next fmt_word needs split from current column
 586           if (ii < num_words_per_line)
 587             if (idf(ii+1) && (! isempty (strfind (words{icol, 1}, fmt_words{ii+1}))))
 588               --icol;
 589             elseif (idg(ii+1))
 590               --icol;
 591             endif
 592           endif
 593         endif
 594         ## Next fmt_word, next column
 595         ++ii; ++icol;
 596       endwhile
 597
 598       ## Done.  Reshape words back into 1 long vector and strip padded empty words
 599       words = reshape (words, 1, numel (words))(1 : end-num_words_padded);
 600
 601     catch
 602       warning ("strread: unable to parse text or file with given format string");
 603       return;
 604
 605     end_try_catch
 606   endif
 607
 608   ## For each specifier, process corresponding column
 609   k = 1;
 610   for m = 1:num_words_per_line
 611     try
 612       if (format_repeat_count < 0)
 613         data = words(m:num_words_per_line:end);
 614       elseif (format_repeat_count == 0)
 615         data = {};
 616       else
 617         lastline = ...
 618           min (num_words_per_line * format_repeat_count + m - 1, numel (words));
 619         data = words(m:num_words_per_line:lastline);
 620       endif
 621
 622       ## Map to format
 623       ## FIXME - add support for formats like "<%s>", "%[a-zA-Z]"
 624       ##         Someone with regexp experience is needed.
 625       switch fmt_words{m}(1:min (2, length (fmt_words{m})))
 626         case "%s"
 627           if (pad_out)
 628             data(end+1:num_lines) = {""};
 629           endif
 630           varargout{k} = data';
 631           k++;
 632         case {"%d", "%u", "%f", "%n"}
 633           n = cellfun ("isempty", data);
 634           ### FIXME - erroneously formatted data lead to NaN, not an error
 635           data = str2double (data);
 636           if (! isempty (regexp (fmt_words{m}, "%[du]")))
 637             ## Cast to integer
 638             ## FIXME: NaNs will be transformed into zeros
 639             data = int32 (data);
 640           endif
 641           data(n) = numeric_fill_value;
 642           if (pad_out)
 643             data(end+1:num_lines) = numeric_fill_value;
 644           endif
 645           varargout{k} = data.';
 646           k++;
 647         case {"%0", "%1", "%2", "%3", "%4", "%5", "%6", "%7", "%8", "%9"}
 648           nfmt = strsplit (fmt_words{m}(2:end-1), '.');
 649           swidth = str2double (nfmt{1});
 650           switch fmt_words{m}(end)
 651             case {"d", "u", "f", "n%"}
 652               n = cellfun ("isempty", data);
 653               ### FIXME - erroneously formatted data lead to NaN, not an error
 654               ###         => ReturnOnError can't be implemented for numeric data
 655               data = str2double (strtrunc (data, swidth));
 656               data(n) = numeric_fill_value;
 657               if (pad_out)
 658                 data(end+1:num_lines) = numeric_fill_value;
 659               endif
 660               if (numel (nfmt) > 1)
 661                 sprec = str2double (nfmt{2});
 662                 data = 10^-sprec * round (10^sprec * data);
 663               elseif (! isempty (regexp (fmt_words{m}, "[du]")))
 664                 ## Cast to integer
 665                 ## FIXME: NaNs will be transformed into zeros
 666                 data = int32 (data);
 667               endif
 668               varargout{k} = data.';
 669               k++;
 670             case "s"
 671               if (pad_out)
 672                 data(end+1:num_lines) = {""};
 673               endif
 674               varargout{k} = strtrunc (data, swidth)';
 675               k++;
 676             otherwise
 677           endswitch
 678         case {"%*", "%*s"}
 679           ## skip the word
 680         otherwise
 681           ## Ensure descriptive content is consistent.
 682           ## Test made a bit lax to accomodate for incomplete last lines
 683           n = find (! cellfun ("isempty", data));
 684           if (numel (unique (data(n))) > 1
 685               || ! strcmpi (unique (data), fmt_words{m}))
 686             error ("strread: FORMAT does not match data");
 687           endif
 688       endswitch
 689     catch
 690       ## As strread processes columnwise, ML-compatible error processing
 691       ## (row after row) is not feasible. In addition Octave sets unrecognizable
 692       ## numbers to NaN w/o error.  But maybe Octave is better in this respect.
 693       if (err_action)
 694         ## Just try the next column where ML bails out
 695       else
 696         rethrow (lasterror);
 697       endif
 698     end_try_catch
 699   endfor
 700
 701 endfunction
 702
 703 function out = split_by (text, sep, mult_dlms_s1, eol_char)
 704
 705   ## Check & if needed, process MultipleDelimsAsOne parameter
 706   if (mult_dlms_s1)
 707     mult_dlms_s1 = true;
 708     ## FIXME: Should re-implement strsplit() function here in order
 709     ## to avoid strrep on megabytes of data.
 710     ## If \n is in sep collection we need to enclose it in text
 711     ## to avoid it being included in consecutive delim series
 712     enchr = ' ';
 713     ## However watch out if eol_char is also in delimiters
 714     if (index (sep, eol_char)); enchr = char(255); endif
 715     text = strrep (text, eol_char, [enchr eol_char enchr]);
 716   else
 717     mult_dlms_s1 = false;
 718   endif
 719
 720   ## Split text string along delimiters
 721   out = strsplit (text, sep, mult_dlms_s1);
 722   if (index (sep, eol_char)); out = strrep (out, char(255), ''); endif
 723   ## In case of trailing delimiter, strip stray last empty word
 724   if (!isempty (out) && any (sep == text(end)))
 725     out(end) = [];
 726   endif
 727
 728   ## Empty cells converted to empty cellstrings.
 729   out(cellfun ("isempty", out)) = {""};
 730
 731 endfunction
 732
 733
 734 %!test
 735 %! [a, b] = strread ("1 2", "%f%f");
 736 %! assert (a, 1);
 737 %! assert (b, 2);
 738
 739 %!test
 740 %! str = '';
 741 %! a = rand (10, 1);
 742 %! b = char (randi ([65, 85], 10, 1));
 743 %! for k = 1:10
 744 %!   str = sprintf ('%s %.6f %s\n', str, a(k), b(k));
 745 %! endfor
 746 %! [aa, bb] = strread (str, '%f %s');
 747 %! assert (a, aa, 1e-6);
 748 %! assert (cellstr (b), bb);
 749
 750 %!test
 751 %! str = '';
 752 %! a = rand (10, 1);
 753 %! b = char (randi ([65, 85], 10, 1));
 754 %! for k = 1:10
 755 %!   str = sprintf ('%s %.6f %s\n', str, a(k), b(k));
 756 %! endfor
 757 %! aa = strread (str, '%f %*s');
 758 %! assert (a, aa, 1e-6);
 759
 760 %!test
 761 %! str = sprintf ('/* this is\nacomment*/ 1 2 3');
 762 %! a = strread (str, '%f', 'commentstyle', 'c');
 763 %! assert (a, [1; 2; 3]);
 764
 765 %!test
 766 %! str = "# comment\n# comment\n1 2 3";
 767 %! [a, b] = strread (str, '%n %s', 'commentstyle', 'shell', 'endofline', "\n");
 768 %! assert (a, [1; 3]);
 769 %! assert (b, {"2"});
 770
 771 %!test
 772 %! str = sprintf ("Tom 100 miles/hr\nDick 90 miles/hr\nHarry 80 miles/hr");
 773 %! fmt = "%s %f miles/hr";
 774 %! c = cell (1, 2);
 775 %! [c{:}] = strread (str, fmt);
 776 %! assert (c{1}, {"Tom"; "Dick"; "Harry"})
 777 %! assert (c{2}, [100; 90; 80])
 778
 779 %!test
 780 %! a = strread ("a b c, d e, , f", "%s", "delimiter", ",");
 781 %! assert (a, {"a b c"; "d e"; ""; "f"});
 782
 783 %!test
 784 %! # Bug #33536
 785 %! [a, b, c] = strread ("1,,2", "%s%s%s", "delimiter", ",");
 786 %! assert (a{1}, '1');
 787 %! assert (b{1}, '');
 788 %! assert (c{1}, '2');
 789
 790 %!test
 791 %! # Bug #33536
 792 %! a = strread ("[SomeText]", "[%s", "delimiter", "]");
 793 %! assert (a{1}, "SomeText");
 794
 795 %!test
 796 %! dat = "Data file.\r\n=  =  =  =  =\r\nCOMPANY    : <Company name>\r\n";
 797 %! a = strread (dat, "%s", 'delimiter', "\n", 'whitespace', '', 'endofline', "\r\n");
 798 %! assert (a{2}, "=  =  =  =  =");
 799 %! assert (double (a{3}(end-5:end)), [32 110 97 109 101 62]);
 800
 801 %!test
 802 %! [a, b, c, d] = strread ("1,2,3,,5,6", "%d%f%d%f", 'delimiter', ',');
 803 %! assert (c, int32 (3));
 804 %! assert (d, NaN);
 805
 806 %!test
 807 %! [a, b, c, d] = strread ("1,2,3,,5,6\n", "%d%d%f%d", 'delimiter', ',');
 808 %! assert (c, [3; NaN]);
 809 %! assert (d, int32 ([0; 0]));
 810
 811 %!test
 812 %! # Default format (= %f)
 813 %1 [a, b, c] = strread ("0.12 0.234 0.3567");
 814 %1 assert (a, 0.12);
 815 %1 assert (b, 0.234);
 816 %1 assert (c, 0.3567);
 817
 818 %!test
 819 %! [a, b] = strread('0.41 8.24 3.57 6.24 9.27', "%f%f", 2, 'delimiter', ' ');
 820 %1 assert (a, [0.41; 3.57]);
 821
 822 %!test
 823 %! # TreatAsEmpty
 824 %! [a, b, c, d] = strread ("1,2,3,NN,5,6\n", "%d%d%d%f", 'delimiter', ',', 'TreatAsEmpty', 'NN');
 825 %! assert (c, int32 ([3; 0]));
 826 %! assert (d, [NaN; NaN]);
 827
 828 %!test
 829 %! # No delimiters at all besides EOL.  Plain reading numbers & strings
 830 %! str = "Text1Text2Text\nText398Text4Text\nText57Text";
 831 %! [a, b] = strread (str, "Text%dText%1sText");
 832 %! assert (a, int32 ([1; 398; 57]));
 833 %! assert (b(1:2), {'2'; '4'});
 834 %! assert (isempty (b{3}), true);
 835
 836 %% MultipleDelimsAsOne
 837 %!test
 838 %! str = "11, 12, 13,, 15\n21,, 23, 24, 25\n,, 33, 34, 35";
 839 %! [a b c d] = strread (str, "%f %f %f %f", 'delimiter', ',', 'multipledelimsasone', 1, 'endofline', "\n");
 840 %! assert (a', [11, 21, NaN]);
 841 %! assert (b', [12, 23, 33]);
 842 %! assert (c', [13, 24, 34]);
 843 %! assert (d', [15, 25, 35]);
 844
 845 %% delimiter as sq_string and dq_string
 846 %!test
 847 %! assert (strread ("1\n2\n3", "%d", "delimiter", "\n"),
 848 %!         strread ("1\n2\n3", "%d", "delimiter", '\n'))
 849
 850 %% whitespace as sq_string and dq_string
 851 %!test
 852 %! assert (strread ("1\b2\r3\b4\t5", "%d", "whitespace", "\b\r\n\t"),
 853 %!         strread ("1\b2\r3\b4\t5", "%d", "whitespace", '\b\r\n\t'))
 854
 855 %!test
 856 %! str =  "0.31 0.86 0.94\n 0.60 0.72 0.87";
 857 %! fmt = "%f %f %f";
 858 %! args = {"delimiter", " ", "endofline", "\n", "whitespace", " "};
 859 %! [a, b, c] = strread (str, fmt, args {:});
 860 %! assert (a, [0.31; 0.60], 0.01)
 861 %! assert (b, [0.86; 0.72], 0.01)
 862 %! assert (c, [0.94; 0.87], 0.01)
 863
 864 %!test
 865 %! str =  "0.31,0.86,0.94\n0.60,0.72,0.87";
 866 %! fmt = "%f %f %f";
 867 %! args = {"delimiter", ",", "endofline", "\n", "whitespace", " "};
 868 %! [a, b, c] = strread (str, fmt, args {:});
 869 %! assert (a, [0.31; 0.60], 0.01)
 870 %! assert (b, [0.86; 0.72], 0.01)
 871 %! assert (c, [0.94; 0.87], 0.01)
 872
 873 %!test
 874 %! str =  "0.31 0.86 0.94\n 0.60 0.72 0.87";
 875 %! fmt = "%f %f %f";
 876 %! args = {"delimiter", ",", "endofline", "\n", "whitespace", " "};
 877 %! [a, b, c] = strread (str, fmt, args {:});
 878 %! assert (a, [0.31; 0.60], 0.01)
 879 %! assert (b, [0.86; 0.72], 0.01)
 880 %! assert (c, [0.94; 0.87], 0.01)
 881
 882 %!test
 883 %! str =  "0.31, 0.86, 0.94\n 0.60, 0.72, 0.87";
 884 %! fmt = "%f %f %f";
 885 %! args = {"delimiter", ",", "endofline", "\n", "whitespace", " "};
 886 %! [a, b, c] = strread (str, fmt, args {:});
 887 %! assert (a, [0.31; 0.60], 0.01)
 888 %! assert (b, [0.86; 0.72], 0.01)
 889 %! assert (c, [0.94; 0.87], 0.01)