X-Git-Url: https://git.creatis.insa-lyon.fr/pubgit/?a=blobdiff_plain;ds=sidebyside;f=octave_packages%2Fm%2Fio%2Fstrread.m;fp=octave_packages%2Fm%2Fio%2Fstrread.m;h=03204443d26f5954bec7620f1b9b953212ce97e2;hb=1c0469ada9531828709108a4882a751d2816994a;hp=0000000000000000000000000000000000000000;hpb=63de9f36673d49121015e3695f2c336ea92bc278;p=CreaPhase.git diff --git a/octave_packages/m/io/strread.m b/octave_packages/m/io/strread.m new file mode 100644 index 0000000..0320444 --- /dev/null +++ b/octave_packages/m/io/strread.m @@ -0,0 +1,889 @@ +## Copyright (C) 2009-2012 Eric Chassande-Mottin, CNRS (France) +## Copyright (C) 2012 Philip Nienhuis +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{a}, @dots{}] =} strread (@var{str}) +## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format}) +## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format}, @var{format_repeat}) +## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format}, @var{prop1}, @var{value1}, @dots{}) +## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format}, @var{format_repeat}, @var{prop1}, @var{value1}, @dots{}) +## Read data from a string. +## +## The string @var{str} is split into words that are repeatedly matched to the +## specifiers in @var{format}. The first word is matched to the first +## specifier, the second to the second specifier and so forth. If there are +## more words than specifiers, the process is repeated until all words have +## been processed. +## +## The string @var{format} describes how the words in @var{str} should be +## parsed. +## It may contain any combination of the following specifiers: +## +## @table @code +## @item %s +## The word is parsed as a string. +## +## @itemx %f +## @itemx %n +## The word is parsed as a number and converted to double. +## +## @item %d +## @itemx %u +## The word is parsed as a number and converted to int32. +## +## @item %*', '%*f', '%*s +## The word is skipped. +## +## For %s and %d, %f, %n, %u and the associated %*s @dots{} specifiers an +## optional width can be specified as %Ns, etc. where N is an integer > 1. +## For %f, format specifiers like %N.Mf are allowed. +## +## @item literals +## In addition the format may contain literal character strings; these will be +## skipped during reading. +## @end table +## +## Parsed word corresponding to the first specifier are returned in the first +## output argument and likewise for the rest of the specifiers. +## +## By default, @var{format} is @t{"%f"}, meaning that numbers are read from +## @var{str}. This will do if @var{str} contains only numeric fields. +## +## For example, the string +## +## @example +## @group +## @var{str} = "\ +## Bunny Bugs 5.5\n\ +## Duck Daffy -7.5e-5\n\ +## Penguin Tux 6" +## @end group +## @end example +## +## @noindent +## can be read using +## +## @example +## [@var{a}, @var{b}, @var{c}] = strread (@var{str}, "%s %s %f"); +## @end example +## +## Optional numeric argument @var{format_repeat} can be used for +## limiting the number of items read: +## +## @table @asis +## @item -1 +## (default) read all of the string until the end. +## +## @item N +## Read N times @var{nargout} items. 0 (zero) is an acceptable +## value for @var{format_repeat}. +## @end table +## +## The behavior of @code{strread} can be changed via property-value +## pairs. The following properties are recognized: +## +## @table @asis +## @item "commentstyle" +## Parts of @var{str} are considered comments and will be skipped. +## @var{value} is the comment style and can be any of the following. +## @itemize +## @item "shell" +## Everything from @code{#} characters to the nearest end-of-line is skipped. +## +## @item "c" +## Everything between @code{/*} and @code{*/} is skipped. +## +## @item "c++" +## Everything from @code{//} characters to the nearest end-of-line is skipped. +## +## @item "matlab" +## Everything from @code{%} characters to the nearest end-of-line is skipped. +## +## @item user-supplied. Two options: +## (1) One string, or 1x1 cell string: Skip everything to the right of it; +## (2) 2x1 cell string array: Everything between the left and right strings +## is skipped. +## @end itemize +## +## @item "delimiter" +## Any character in @var{value} will be used to split @var{str} into words +## (default value = any whitespace). +## +## @item "emptyvalue": +## Value to return for empty numeric values in non-whitespace delimited data. +## The default is NaN@. When the data type does not support NaN +## (int32 for example), then default is zero. +## +## @item "multipledelimsasone" +## Treat a series of consecutive delimiters, without whitespace in between, +## as a single delimiter. Consecutive delimiter series need not be vertically +## "aligned". +## +## @item "treatasempty" +## Treat single occurrences (surrounded by delimiters or whitespace) of the +## string(s) in @var{value} as missing values. +## +## @item "returnonerror" +## If @var{value} true (1, default), ignore read errors and return normally. +## If false (0), return an error. +## +## @item "whitespace" +## Any character in @var{value} will be interpreted as whitespace and +## trimmed; the string defining whitespace must be enclosed in double +## quotes for proper processing of special characters like \t. +## The default value for whitespace = " \b\r\n\t" (note the space). +## Unless whitespace is set to '' (empty) AND at least one "%s" format +## conversion specifier is supplied, a space is always part of whitespace. +## +## @end table +## +## @seealso{textscan, textread, load, dlmread, fscanf} +## @end deftypefn + +function varargout = strread (str, format = "%f", varargin) + + ## Check input + if (nargin < 1) + print_usage (); + endif + + if (isempty (format)) + format = "%f"; + endif + + if (! ischar (str) || ! ischar (format)) + error ("strread: STR and FORMAT arguments must be strings"); + endif + + ## Parse format string to compare number of conversion fields and nargout + nfields = length (strfind (format, "%")) - length (strfind (format, "%*")); + ## If str only has numeric fields, a (default) format ("%f") will do. + ## Otherwise: + if ((max (nargout, 1) != nfields) && ! strcmp (format, "%f")) + error ("strread: the number of output variables must match that specified by FORMAT"); + endif + + ## Check for format string repeat count + format_repeat_count = -1; + if (nargin > 2 && isnumeric (varargin{1})) + if (varargin{1} >= 0) + format_repeat_count = varargin{1}; + endif + if (nargin > 3) + varargin = varargin(2:end); + else + varargin = {}; + endif + endif + + ## Parse options. First initialize defaults + comment_flag = false; + delimiter_str = ""; + empty_str = ""; + eol_char = ""; + err_action = 0; + mult_dlms_s1 = false; + numeric_fill_value = NaN; + white_spaces = " \b\r\n\t"; + for n = 1:2:length (varargin) + switch (lower (varargin{n})) + case "bufsize" + ## We could synthesize this, but that just seems weird... + warning ('strread: property "bufsize" is not implemented'); + case "commentstyle" + comment_flag = true; + switch (lower (varargin{n+1})) + case "c" + [comment_start, comment_end] = deal ("/*", "*/"); + case "c++" + [comment_start, comment_end] = deal ("//", "eol_char"); + case "shell" + [comment_start, comment_end] = deal ("#" , "eol_char"); + case "matlab" + [comment_start, comment_end] = deal ("%" , "eol_char"); + otherwise + if (ischar (varargin{n+1}) || + (numel (varargin{n+1}) == 1 && iscellstr (varargin{n+1}))) + [comment_start, comment_end] = deal (char (varargin{n+1}), "eol_char"); + elseif (iscellstr (varargin{n+1}) && numel (varargin{n+1}) == 2) + [comment_start, comment_end] = deal (varargin{n+1}{:}); + else + ## FIXME - a user may have numeric values specified: {'//', 7} + ## this will lead to an error in the warning message + error ("strread: unknown or unrecognized comment style '%s'", + varargin{n+1}); + endif + endswitch + case "delimiter" + delimiter_str = varargin{n+1}; + if (strcmp (typeinfo (delimiter_str), "sq_string")) + delimiter_str = do_string_escapes (delimiter_str); + endif + case "emptyvalue" + numeric_fill_value = varargin{n+1}; + case "expchars" + warning ('strread: property "expchars" is not implemented'); + case "whitespace" + white_spaces = varargin{n+1}; + if (strcmp (typeinfo (white_spaces), "sq_string")) + white_spaces = do_string_escapes (white_spaces); + endif + ## The following parameters are specific to textscan and textread + case "endofline" + eol_char = varargin{n+1}; + if (strcmp (typeinfo (eol_char), "sq_string")) + eol_char = do_string_escapes (eol_char); + endif + case "returnonerror" + err_action = varargin{n+1}; + case "multipledelimsasone" + mult_dlms_s1 = varargin{n+1}; + case "treatasempty" + if (iscellstr (varargin{n+1})) + empty_str = varargin{n+1}; + elseif (ischar (varargin{n+1})) + empty_str = varargin(n+1); + else + error ('strread: "treatasempty" value must be string or cellstr'); + endif + otherwise + warning ('strread: unknown property "%s"', varargin{n}); + endswitch + endfor + + ## First parse of FORMAT + if (strcmpi (strtrim (format), "%f")) + ## Default format specified. Expand it (to desired nargout) + fmt_words = cell (nargout, 1); + fmt_words (1:nargout) = format; + else + ## Determine the number of words per line as a first guess. Forms + ## like %f) (w/o delimiter in between) are fixed further on + format = strrep (format, "%", " %"); + fmt_words = regexp (format, '[^ ]+', 'match'); + ## Format conversion specifiers following literals w/o space/delim + ## in between are separate now. Separate those w trailing literals + idy2 = find (! cellfun ("isempty", strfind (fmt_words, "%"))); + a = strfind (fmt_words(idy2), "%"); + b = regexp (fmt_words(idy2), '[nfdus]', 'end'); + for jj = 1:numel (a) + ii = numel (a) - jj + 1; + if (! (length (fmt_words{idy2(ii)}) == b{ii}(1))) + ## Fix format_words + fmt_words(idy2(ii)+1 : end+1) = fmt_words(idy2(ii) : end); + fmt_words{idy2(ii)} = fmt_words{idy2(ii)}(a{ii} : b{ii}(1)); + fmt_words{idy2(ii)+1} = fmt_words{idy2(ii)+1}(b{ii}+1:end); + endif + endfor + endif + num_words_per_line = numel (fmt_words); + + ## Special handling for CRLF EOL character in str + if (! isempty (eol_char) && strcmp (eol_char, "\r\n")) + ## Strip CR from CRLF sequences + str = strrep (str, "\r\n", "\n"); + ## CR serves no further purpose in function + eol_char = "\n"; + endif + + ## Remove comments in str + if (comment_flag) + ## Expand 'eol_char' here, after option processing which may have set value + comment_end = regexprep (comment_end, 'eol_char', eol_char); + cstart = strfind (str, comment_start); + cstop = strfind (str, comment_end); + ## Treat end of string as additional comment stop + if (isempty (cstop) || cstop(end) != length (str)) + cstop(end+1) = length (str); + endif + if (! isempty (cstart)) + ## Ignore nested openers. + [idx, cidx] = unique (lookup (cstop, cstart), "first"); + if (idx(end) == length (cstop)) + cidx(end) = []; # Drop the last one if orphaned. + endif + cstart = cstart(cidx); + endif + if (! isempty (cstop)) + ## Ignore nested closers. + [idx, cidx] = unique (lookup (cstart, cstop), "first"); + if (idx(1) == 0) + cidx(1) = []; # Drop the first one if orphaned. + endif + cstop = cstop(cidx); + endif + len = length (str); + c2len = length (comment_end); + str = cellslices (str, [1, cstop + c2len], [cstart - 1, len]); + str = [str{:}]; + endif + + if (! isempty (white_spaces)) + ## For numeric fields, whitespace is always a delimiter, but not for text fields + if (isempty (strfind (format, "%s"))) + ## Add whitespace to delimiter set + delimiter_str = unique ([white_spaces delimiter_str]); + else + ## Remove any delimiter chars from white_spaces list + white_spaces = setdiff (white_spaces, delimiter_str); + endif + endif + if (isempty (delimiter_str)) + delimiter_str = " "; + endif + if (! isempty (eol_char)) + ## Add eol_char to delimiter collection + delimiter_str = unique ([delimiter_str eol_char]); + ## .. and remove it from whitespace collection + white_spaces = strrep (white_spaces, eol_char, ''); + endif + + pad_out = 0; + ## Trim whitespace if needed + if (! isempty (white_spaces)) + ## Check if trailing "\n" might signal padding output arrays to equal size + ## before it is trimmed away below + if ((str(end) == 10) && (nargout > 1)) + pad_out = 1; + endif + ## Condense all repeated whitespace into one single space + ## FIXME: this will also fold repeated whitespace in a char field + rxp_wsp = sprintf ("[%s]+", white_spaces); + str = regexprep (str, rxp_wsp, ' '); + ## Remove possible leading space at string + if (str(1) == 32) + str = str(2:end); + endif + ## Check for single delimiter followed/preceded by whitespace + ## FIXME: Double strrep on str is enormously expensive of CPU time. + ## Can this be eliminated + if (! isempty (delimiter_str)) + dlmstr = setdiff (delimiter_str, " "); + rxp_dlmwsp = sprintf ("( [%s]|[%s] )", dlmstr, dlmstr); + str = regexprep (str, rxp_dlmwsp, delimiter_str(1)); + endif + ## FIXME: Double strrep on str is enormously expensive of CPU time. + ## Can this be eliminated + ## Wipe leading and trailing whitespace on each line (it may be delimiter too) + if (! isempty (eol_char)) + str = strrep (str, [eol_char " "], eol_char); + str = strrep (str, [" " eol_char], eol_char); + endif + endif + + ## Split 'str' into words + words = split_by (str, delimiter_str, mult_dlms_s1, eol_char); + if (! isempty (white_spaces)) + ## Trim leading and trailing white_spaces + ## FIXME: Is this correct? strtrim clears what matches isspace(), not + ## necessarily what is in white_spaces. + words = strtrim (words); + endif + num_words = numel (words); + ## First guess at number of lines in file (ignoring leading/trailing literals) + num_lines = ceil (num_words / num_words_per_line); + + ## Replace TreatAsEmpty char sequences by empty strings + if (! isempty (empty_str)) + for ii = 1:numel (empty_str) + idz = strmatch (empty_str{ii}, words, "exact"); + words(idz) = {""}; + endfor + endif + + ## fmt_words has been split properly now, but words{} has only been split on + ## delimiter positions. + ## As numeric fields can also be separated by whitespace, more splits may be + ## needed. + ## We also don't know the number of lines (as EndOfLine may have been set to + ## "" (empty) by the caller). + ## + ## We also may have to cope with 3 cases as far as literals go: + ## A: Trailing literals (%f) w/o delimiter in between. + ## B: Leading literals (%f) w/o delimiter in between. + ## C. Skipping leftover parts of specified skip fields (%*N ) + ## Some words columns may have to be split further to fix these. + + ## Find indices and pointers to possible literals in fmt_words + idf = cellfun ("isempty", strfind (fmt_words, "%")); + ## Find indices and pointers to conversion specifiers with fixed width + idg = ! cellfun ("isempty", regexp (fmt_words, '%\*?\d')); + idy = find (idf | idg); + ## Find indices to numeric conversion specifiers + idn = ! cellfun ("isempty", regexp (fmt_words, "%[dnfu]")); + + ## If needed, split up columns in three steps: + if (! isempty (idy)) + ## Try-catch because complexity of strings to read can be infinite + try + + ## 1. Assess "period" in the split-up words array ( < num_words_per_line). + ## Could be done using EndOfLine but that prohibits EndOfLine = "" option. + ## Alternative below goes by simply parsing a first grab of words + ## and counting words until the fmt_words array is exhausted: + iwrd = 1; iwrdp = 0; iwrdl = length (words{iwrd}); + for ii = 1:numel (fmt_words) + + nxt_wrd = 0; + + if (idf(ii)) + ## Literal expected + if (isempty (strfind (fmt_words{ii}, words(iwrd)))) + ## Not found in current word; supposed to be in next word + nxt_wrd = 1; + else + ## Found it in current word. Subtract literal length + iwrdp += length (fmt_words{ii}); + if (iwrdp > iwrdl) + ## Parse error. Literal extends beyond delimiter (word boundary) + warning ("strread: literal '%s' (fmt spec # %d) does not match data", ... + fmt_words{ii}, ii); + ## Word assumed to be completely "used up". Next word + nxt_wrd = 1; + elseif (iwrdp == iwrdl) + ## Word completely "used up". Next word + nxt_wrd = 1; + endif + endif + + elseif (idg(ii)) + ## Fixed width specifier (%N or %*N): read just a part of word + iwrdp += floor ... + (str2double (fmt_words{ii}(regexp(fmt_words{ii}, '\d') : end-1))); + if (iwrdp > iwrdl) + ## Match error. Field extends beyond word boundary. + warning ... + ("strread: field width '%s' (fmt spec # %d) extends beyond actual word limit", ... + fmt_words{ii}, ii); + ## Assume word to be completely "used up". Next word + nxt_wrd = 1; + elseif (iwrdp == iwrdl) + ## Word completely "used up". Next word + nxt_wrd = 1; + endif + + else + ## A simple format conv. specifier. Either (1) uses rest of word, or + ## (2) is squeezed between current iwrdp and next literal, or (3) uses + ## next word. (3) is already taken care of. So just check (1) & (2) + if (ii < numel (fmt_words) && idf(ii+1)) + ## Next fmt_word is a literal... + if (! index (words{iwrd}(iwrdp+1:end), fmt_words{ii+1})) + ## ...but not found in current word => field uses rest of word + nxt_wrd = 1; + else + ## ..or it IS found. Add inferred width of current conversion field + iwrdp += index (words{iwrd}(iwrdp+1:end), fmt_words{ii+1}) - 1; + endif + elseif (iwrdp < iwrdl) + ## No bordering literal to the right => field occupies (rest of) word + nxt_wrd = 1; + endif + + endif + + if (nxt_wrd) + ++iwrd; iwrdp = 0; + if (ii < numel (fmt_words)) + iwrdl = length (words{iwrd}); + endif + endif + + endfor + ## Done + words_period = max (iwrd - 1, 1); + num_lines = ceil (num_words / words_period); + + ## 2. Pad words array so that it can be reshaped + tmp_lines = ceil (num_words / words_period); + num_words_padded = tmp_lines * words_period - num_words; + if (num_words_padded) + words = [words'; cell(num_words_padded, 1)]; + endif + words = reshape (words, words_period, tmp_lines); + + ## 3. Do the column splitting on rectangular words array + icol = 1; ii = 1; # icol = current column, ii = current fmt_word + while (ii <= num_words_per_line) + + ## Check if fmt_words(ii) contains a literal or fixed-width + if ((idf(ii) || idg(ii)) && (rows(words) < num_words_per_line)) + if (idf(ii)) + s = strfind (words(icol, 1), fmt_words{ii}); + if (isempty (s{:})) + error ("strread: Literal '%s' not found in column %d", fmt_words{ii}, icol); + endif + s = s{:}(1); + e = s(1) + length (fmt_words{ii}) - 1; + endif + if (! strcmp (fmt_words{ii}, words{icol, 1})) + ## Column doesn't exactly match literal => split needed. Insert a column + words(icol+1:end+1, :) = words(icol:end, :); + ## Watch out for empty cells + jptr = find (! cellfun ("isempty", words(icol, :))); + + ## Distinguish leading or trailing literals + if (! idg(ii) && ! isempty (s) && s(1) == 1) + ## Leading literal. Assign literal to icol, paste rest in icol + 1 + ## Apply only to those cells that do have something beyond literal + jptr = find (cellfun("length", words(icol+1, jptr), ... + "UniformOutput", false) > e(1)); + words(icol+1, :) = {""}; + words(icol+1, jptr) = cellfun ... + (@(x) substr(x, e(1)+1, length(x)-e(1)), words(icol, jptr), ... + "UniformOutput", false); + words(icol, jptr) = fmt_words{ii}; + + else + if (! idg(ii) && ! isempty (strfind (fmt_words{ii-1}, "%s"))) + ## Trailing literal. If preceding format == '%s' this is an error + warning ("Ambiguous '%s' specifier next to literal in column %d", icol); + elseif (idg(ii)) + ## Current field = fixed width. Strip into icol, rest in icol+1 + wdth = floor (str2double (fmt_words{ii}(regexp(fmt_words{ii}, ... + '\d') : end-1))); + words(icol+1, jptr) = cellfun (@(x) x(wdth+1:end), + words(icol,jptr), "UniformOutput", false); + words(icol, jptr) = strtrunc (words(icol, jptr), wdth); + else + ## FIXME: this assumes char(254)/char(255) won't occur in input! + clear wrds; + wrds(1:2:2*numel (words(icol, jptr))) = ... + strrep (words(icol, jptr), fmt_words{ii}, ... + [char(255) char(254)]); + wrds(2:2:2*numel (words(icol, jptr))-1) = char(255); + wrds = strsplit ([wrds{:}], char(255)); + words(icol, jptr) = ... + wrds(find (cellfun ("isempty", strfind (wrds, char(254))))); + wrds(find (cellfun ("isempty", strfind (wrds, char(254))))) ... + = char(255); + words(icol+1, jptr) = strsplit (strrep ([wrds{2:end}], ... + char(254), fmt_words{ii}), char(255)); + ## Former trailing literal may now be leading for next specifier + --ii; + endif + endif + endif + + else + ## Conv. specifier. Peek if next fmt_word needs split from current column + if (ii < num_words_per_line) + if (idf(ii+1) && (! isempty (strfind (words{icol, 1}, fmt_words{ii+1})))) + --icol; + elseif (idg(ii+1)) + --icol; + endif + endif + endif + ## Next fmt_word, next column + ++ii; ++icol; + endwhile + + ## Done. Reshape words back into 1 long vector and strip padded empty words + words = reshape (words, 1, numel (words))(1 : end-num_words_padded); + + catch + warning ("strread: unable to parse text or file with given format string"); + return; + + end_try_catch + endif + + ## For each specifier, process corresponding column + k = 1; + for m = 1:num_words_per_line + try + if (format_repeat_count < 0) + data = words(m:num_words_per_line:end); + elseif (format_repeat_count == 0) + data = {}; + else + lastline = ... + min (num_words_per_line * format_repeat_count + m - 1, numel (words)); + data = words(m:num_words_per_line:lastline); + endif + + ## Map to format + ## FIXME - add support for formats like "<%s>", "%[a-zA-Z]" + ## Someone with regexp experience is needed. + switch fmt_words{m}(1:min (2, length (fmt_words{m}))) + case "%s" + if (pad_out) + data(end+1:num_lines) = {""}; + endif + varargout{k} = data'; + k++; + case {"%d", "%u", "%f", "%n"} + n = cellfun ("isempty", data); + ### FIXME - erroneously formatted data lead to NaN, not an error + data = str2double (data); + if (! isempty (regexp (fmt_words{m}, "%[du]"))) + ## Cast to integer + ## FIXME: NaNs will be transformed into zeros + data = int32 (data); + endif + data(n) = numeric_fill_value; + if (pad_out) + data(end+1:num_lines) = numeric_fill_value; + endif + varargout{k} = data.'; + k++; + case {"%0", "%1", "%2", "%3", "%4", "%5", "%6", "%7", "%8", "%9"} + nfmt = strsplit (fmt_words{m}(2:end-1), '.'); + swidth = str2double (nfmt{1}); + switch fmt_words{m}(end) + case {"d", "u", "f", "n%"} + n = cellfun ("isempty", data); + ### FIXME - erroneously formatted data lead to NaN, not an error + ### => ReturnOnError can't be implemented for numeric data + data = str2double (strtrunc (data, swidth)); + data(n) = numeric_fill_value; + if (pad_out) + data(end+1:num_lines) = numeric_fill_value; + endif + if (numel (nfmt) > 1) + sprec = str2double (nfmt{2}); + data = 10^-sprec * round (10^sprec * data); + elseif (! isempty (regexp (fmt_words{m}, "[du]"))) + ## Cast to integer + ## FIXME: NaNs will be transformed into zeros + data = int32 (data); + endif + varargout{k} = data.'; + k++; + case "s" + if (pad_out) + data(end+1:num_lines) = {""}; + endif + varargout{k} = strtrunc (data, swidth)'; + k++; + otherwise + endswitch + case {"%*", "%*s"} + ## skip the word + otherwise + ## Ensure descriptive content is consistent. + ## Test made a bit lax to accomodate for incomplete last lines + n = find (! cellfun ("isempty", data)); + if (numel (unique (data(n))) > 1 + || ! strcmpi (unique (data), fmt_words{m})) + error ("strread: FORMAT does not match data"); + endif + endswitch + catch + ## As strread processes columnwise, ML-compatible error processing + ## (row after row) is not feasible. In addition Octave sets unrecognizable + ## numbers to NaN w/o error. But maybe Octave is better in this respect. + if (err_action) + ## Just try the next column where ML bails out + else + rethrow (lasterror); + endif + end_try_catch + endfor + +endfunction + +function out = split_by (text, sep, mult_dlms_s1, eol_char) + + ## Check & if needed, process MultipleDelimsAsOne parameter + if (mult_dlms_s1) + mult_dlms_s1 = true; + ## FIXME: Should re-implement strsplit() function here in order + ## to avoid strrep on megabytes of data. + ## If \n is in sep collection we need to enclose it in text + ## to avoid it being included in consecutive delim series + enchr = ' '; + ## However watch out if eol_char is also in delimiters + if (index (sep, eol_char)); enchr = char(255); endif + text = strrep (text, eol_char, [enchr eol_char enchr]); + else + mult_dlms_s1 = false; + endif + + ## Split text string along delimiters + out = strsplit (text, sep, mult_dlms_s1); + if (index (sep, eol_char)); out = strrep (out, char(255), ''); endif + ## In case of trailing delimiter, strip stray last empty word + if (!isempty (out) && any (sep == text(end))) + out(end) = []; + endif + + ## Empty cells converted to empty cellstrings. + out(cellfun ("isempty", out)) = {""}; + +endfunction + + +%!test +%! [a, b] = strread ("1 2", "%f%f"); +%! assert (a, 1); +%! assert (b, 2); + +%!test +%! str = ''; +%! a = rand (10, 1); +%! b = char (randi ([65, 85], 10, 1)); +%! for k = 1:10 +%! str = sprintf ('%s %.6f %s\n', str, a(k), b(k)); +%! endfor +%! [aa, bb] = strread (str, '%f %s'); +%! assert (a, aa, 1e-6); +%! assert (cellstr (b), bb); + +%!test +%! str = ''; +%! a = rand (10, 1); +%! b = char (randi ([65, 85], 10, 1)); +%! for k = 1:10 +%! str = sprintf ('%s %.6f %s\n', str, a(k), b(k)); +%! endfor +%! aa = strread (str, '%f %*s'); +%! assert (a, aa, 1e-6); + +%!test +%! str = sprintf ('/* this is\nacomment*/ 1 2 3'); +%! a = strread (str, '%f', 'commentstyle', 'c'); +%! assert (a, [1; 2; 3]); + +%!test +%! str = "# comment\n# comment\n1 2 3"; +%! [a, b] = strread (str, '%n %s', 'commentstyle', 'shell', 'endofline', "\n"); +%! assert (a, [1; 3]); +%! assert (b, {"2"}); + +%!test +%! str = sprintf ("Tom 100 miles/hr\nDick 90 miles/hr\nHarry 80 miles/hr"); +%! fmt = "%s %f miles/hr"; +%! c = cell (1, 2); +%! [c{:}] = strread (str, fmt); +%! assert (c{1}, {"Tom"; "Dick"; "Harry"}) +%! assert (c{2}, [100; 90; 80]) + +%!test +%! a = strread ("a b c, d e, , f", "%s", "delimiter", ","); +%! assert (a, {"a b c"; "d e"; ""; "f"}); + +%!test +%! # Bug #33536 +%! [a, b, c] = strread ("1,,2", "%s%s%s", "delimiter", ","); +%! assert (a{1}, '1'); +%! assert (b{1}, ''); +%! assert (c{1}, '2'); + +%!test +%! # Bug #33536 +%! a = strread ("[SomeText]", "[%s", "delimiter", "]"); +%! assert (a{1}, "SomeText"); + +%!test +%! dat = "Data file.\r\n= = = = =\r\nCOMPANY : \r\n"; +%! a = strread (dat, "%s", 'delimiter', "\n", 'whitespace', '', 'endofline', "\r\n"); +%! assert (a{2}, "= = = = ="); +%! assert (double (a{3}(end-5:end)), [32 110 97 109 101 62]); + +%!test +%! [a, b, c, d] = strread ("1,2,3,,5,6", "%d%f%d%f", 'delimiter', ','); +%! assert (c, int32 (3)); +%! assert (d, NaN); + +%!test +%! [a, b, c, d] = strread ("1,2,3,,5,6\n", "%d%d%f%d", 'delimiter', ','); +%! assert (c, [3; NaN]); +%! assert (d, int32 ([0; 0])); + +%!test +%! # Default format (= %f) +%1 [a, b, c] = strread ("0.12 0.234 0.3567"); +%1 assert (a, 0.12); +%1 assert (b, 0.234); +%1 assert (c, 0.3567); + +%!test +%! [a, b] = strread('0.41 8.24 3.57 6.24 9.27', "%f%f", 2, 'delimiter', ' '); +%1 assert (a, [0.41; 3.57]); + +%!test +%! # TreatAsEmpty +%! [a, b, c, d] = strread ("1,2,3,NN,5,6\n", "%d%d%d%f", 'delimiter', ',', 'TreatAsEmpty', 'NN'); +%! assert (c, int32 ([3; 0])); +%! assert (d, [NaN; NaN]); + +%!test +%! # No delimiters at all besides EOL. Plain reading numbers & strings +%! str = "Text1Text2Text\nText398Text4Text\nText57Text"; +%! [a, b] = strread (str, "Text%dText%1sText"); +%! assert (a, int32 ([1; 398; 57])); +%! assert (b(1:2), {'2'; '4'}); +%! assert (isempty (b{3}), true); + +%% MultipleDelimsAsOne +%!test +%! str = "11, 12, 13,, 15\n21,, 23, 24, 25\n,, 33, 34, 35"; +%! [a b c d] = strread (str, "%f %f %f %f", 'delimiter', ',', 'multipledelimsasone', 1, 'endofline', "\n"); +%! assert (a', [11, 21, NaN]); +%! assert (b', [12, 23, 33]); +%! assert (c', [13, 24, 34]); +%! assert (d', [15, 25, 35]); + +%% delimiter as sq_string and dq_string +%!test +%! assert (strread ("1\n2\n3", "%d", "delimiter", "\n"), +%! strread ("1\n2\n3", "%d", "delimiter", '\n')) + +%% whitespace as sq_string and dq_string +%!test +%! assert (strread ("1\b2\r3\b4\t5", "%d", "whitespace", "\b\r\n\t"), +%! strread ("1\b2\r3\b4\t5", "%d", "whitespace", '\b\r\n\t')) + +%!test +%! str = "0.31 0.86 0.94\n 0.60 0.72 0.87"; +%! fmt = "%f %f %f"; +%! args = {"delimiter", " ", "endofline", "\n", "whitespace", " "}; +%! [a, b, c] = strread (str, fmt, args {:}); +%! assert (a, [0.31; 0.60], 0.01) +%! assert (b, [0.86; 0.72], 0.01) +%! assert (c, [0.94; 0.87], 0.01) + +%!test +%! str = "0.31,0.86,0.94\n0.60,0.72,0.87"; +%! fmt = "%f %f %f"; +%! args = {"delimiter", ",", "endofline", "\n", "whitespace", " "}; +%! [a, b, c] = strread (str, fmt, args {:}); +%! assert (a, [0.31; 0.60], 0.01) +%! assert (b, [0.86; 0.72], 0.01) +%! assert (c, [0.94; 0.87], 0.01) + +%!test +%! str = "0.31 0.86 0.94\n 0.60 0.72 0.87"; +%! fmt = "%f %f %f"; +%! args = {"delimiter", ",", "endofline", "\n", "whitespace", " "}; +%! [a, b, c] = strread (str, fmt, args {:}); +%! assert (a, [0.31; 0.60], 0.01) +%! assert (b, [0.86; 0.72], 0.01) +%! assert (c, [0.94; 0.87], 0.01) + +%!test +%! str = "0.31, 0.86, 0.94\n 0.60, 0.72, 0.87"; +%! fmt = "%f %f %f"; +%! args = {"delimiter", ",", "endofline", "\n", "whitespace", " "}; +%! [a, b, c] = strread (str, fmt, args {:}); +%! assert (a, [0.31; 0.60], 0.01) +%! assert (b, [0.86; 0.72], 0.01) +%! assert (c, [0.94; 0.87], 0.01)