octave_packages/m/strings/strtok.m

   1 ## Copyright (C) 2000-2012 Paul Kienzle
   2 ##
   3 ## This file is part of Octave.
   4 ##
   5 ## Octave is free software; you can redistribute it and/or modify it
   6 ## under the terms of the GNU General Public License as published by
   7 ## the Free Software Foundation; either version 3 of the License, or (at
   8 ## your option) any later version.
   9 ##
  10 ## Octave is distributed in the hope that it will be useful, but
  11 ## WITHOUT ANY WARRANTY; without even the implied warranty of
  12 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 ## General Public License for more details.
  14 ##
  15 ## You should have received a copy of the GNU General Public License
  16 ## along with Octave; see the file COPYING.  If not, see
  17 ## <http://www.gnu.org/licenses/>.
  18
  19 ## -*- texinfo -*-
  20 ## @deftypefn  {Function File} {[@var{tok}, @var{rem}] =} strtok (@var{str})
  21 ## @deftypefnx {Function File} {[@var{tok}, @var{rem}] =} strtok (@var{str}, @var{delim})
  22 ##
  23 ## Find all characters in the string @var{str} up to, but not including, the
  24 ## first character which is in the string @var{delim}.  If @var{rem} is
  25 ## requested, it contains the remainder of the string, starting at the first
  26 ## delimiter.  Leading delimiters are ignored.  If @var{delim} is not
  27 ## specified, whitespace is assumed.  @var{str} may also be a cell array of
  28 ## strings in which case the function executes on every individual string
  29 ## and returns a cell array of tokens and remainders.
  30 ##
  31 ## Examples:
  32 ##
  33 ## @example
  34 ## @group
  35 ## strtok ("this is the life")
  36 ##      @result{} "this"
  37 ##
  38 ## [tok, rem] = strtok ("14*27+31", "+-*/")
  39 ##      @result{}
  40 ##         tok = 14
  41 ##         rem = *27+31
  42 ## @end group
  43 ## @end example
  44 ## @seealso{index, strsplit, strchr, isspace}
  45 ## @end deftypefn
  46
  47 function [tok, rem] = strtok (str, delim)
  48
  49   if (nargin < 1 || nargin > 2)
  50     print_usage ();
  51   elseif (! (ischar (str) || iscellstr (str)))
  52     error ("strtok: STR must be a string or cell array of strings.");
  53   elseif (ischar (str) && ! isvector (str) &&! isempty (str))
  54     error ("strtok: STR cannot be a 2-D character array.");
  55   endif
  56
  57   if (nargin < 2 || isempty (delim))
  58     ws_delim = true;
  59   else
  60     ws_delim = false;
  61   endif
  62
  63   if (isempty (str))
  64     tok = rem = "";
  65   elseif (ischar (str))
  66     if (ws_delim)
  67       idx = isspace (str);
  68     elseif (length (delim) <= 7)
  69       ## Build index of delimiters incrementally for low N.
  70       idx = str == delim(1);
  71       for i = 2:length (delim)
  72         idx |= str == delim(i);
  73       endfor
  74     else
  75       ## Index the str into a mask of valid values.  Faster for large N.
  76       f = false (256, 1);
  77       ## This is slower than it could be because of the +1 issue.
  78       f(uint8(delim)+1) = true;
  79       ## Default goes via double -- unnecessarily long.
  80       si = uint32 (str);
  81       ## in-place is faster than str+1
  82       ++si;
  83       idx = f(si);
  84     endif
  85
  86     idx_dlim = find (idx, 1);
  87     idx_nodlim = find (! idx, 1);
  88     if (isempty (idx_dlim))
  89       ## No delimiter.  Return whole string.
  90       tok = str;
  91       rem = "";
  92     elseif (idx_dlim > idx_nodlim)
  93       ## Normal case.  No leading delimiters and at least 1 delimiter in STR.
  94       tok = str(1:idx_dlim-1);
  95       rem = str(idx_dlim:end);
  96     else
  97       ## Leading delimiter found.
  98       idx_dlim = find (idx(idx_nodlim+1:end), 1);
  99       if (isempty (idx_dlim))
 100         ## No further delimiters.  Return STR stripped of delimiter prefix.
 101         tok = str(idx_nodlim:end);
 102         rem = "";
 103       else
 104         ## Strip delimiter prefix.  Return STR up to 1st delimiter
 105         tok = str(idx_nodlim:(idx_dlim + idx_nodlim -1));
 106         rem = str((idx_dlim + idx_nodlim):end);
 107       endif
 108     endif
 109   else    # Cell array of strings
 110     if (ws_delim)
 111       delim = '\s';
 112     endif
 113     ptn = [ '^[' delim ']*','([^' delim ']+)','([' delim '].*)$' ];
 114     matches = regexp (str, ptn, "tokens");
 115     eidx = cellfun ("isempty", matches);
 116     midx = ! eidx;
 117     tok = cell (size (str));
 118     tok(eidx) = regexprep (str(eidx), [ '^[' delim ']+' ], '');
 119     ## Unwrap doubly nested cell array from regexp
 120     tmp = [matches{midx}];
 121     if (! isempty (tmp))
 122       tmp = [tmp{:}];
 123     endif
 124     tok(midx) = tmp(1:2:end);
 125     if (isargout (2))
 126       rem = cell (size (str));
 127       rem(eidx) = {""};
 128       rem(midx) = tmp(2:2:end);
 129     endif
 130   endif
 131
 132 endfunction
 133
 134
 135 %!demo
 136 %! strtok("this is the life")
 137 %! % split at the first space, returning "this"
 138
 139 %!demo
 140 %! s = "14*27+31"
 141 %! while (1)
 142 %!   [t, s] = strtok (s, "+-*/");
 143 %!   printf ("<%s>", t);
 144 %!   if (isempty (s))
 145 %!     break;
 146 %!   endif
 147 %!   printf ("<%s>", s(1));
 148 %! endwhile
 149 %! printf("\n");
 150 %! % ----------------------------------------------------
 151 %! % Demonstrates processing of an entire string split on
 152 %! % a variety of delimiters.  Tokens and delimiters are
 153 %! % printed one after another in angle brackets.
 154
 155 %% Test the tokens for all cases
 156 %!assert (strtok (""), "");             # no string
 157 %!assert (strtok ("this"), "this");     # no delimiter in string
 158 %!assert (strtok ("this "), "this");    # delimiter at end
 159 %!assert (strtok ("this is"), "this");  # delimiter in middle
 160 %!assert (strtok (" this"), "this");    # delimiter at start
 161 %!assert (strtok (" this "), "this");   # delimiter at start and end
 162 %!assert (strtok (" "), ""(1:0));       # delimiter only
 163
 164 %% Test the remainder for all cases
 165 %!test [t,r] = strtok (""); assert (r, "");
 166 %!test [t,r] = strtok ("this"); assert (r, "");
 167 %!test [t,r] = strtok ("this "); assert (r, " ");
 168 %!test [t,r] = strtok ("this is"); assert (r, " is");
 169 %!test [t,r] = strtok (" this"); assert (r, "");
 170 %!test [t,r] = strtok (" this "); assert (r, " ");
 171 %!test [t,r] = strtok (" "); assert (r, "");
 172
 173 %% Test all tokens and remainders with cell array input
 174 %!test
 175 %! str = {"", "this", "this ", "this is", " this", " this ", " "};
 176 %! [t, r] = strtok (str);
 177 %! assert (t{1}, "");
 178 %! assert (r{1}, "");
 179 %! assert (t{2}, "this");
 180 %! assert (r{2}, "");
 181 %! assert (t{3}, "this");
 182 %! assert (r{3}, " ");
 183 %! assert (t{4}, "this");
 184 %! assert (r{4}, " is");
 185 %! assert (t{5}, "this");
 186 %! assert (r{5}, "");
 187 %! assert (t{6}, "this");
 188 %! assert (r{6}, " ");
 189 %! assert (t{7}, "");
 190 %! assert (r{7}, "");
 191
  192 %% Simple check for 2, 3, and 4 delimiters
 193 %!assert(strtok ("this is", "i "), "th");
 194 %!assert(strtok ("this is", "ij "), "th");
 195 %!assert(strtok ("this is", "ijk "), "th");
 196
 197 %% Test all cases for 8 delimiters since a different
 198 %!# algorithm is used when more than 7 delimiters
 199 %!assert (strtok ("","jklmnop "), "");
 200 %!assert (strtok ("this","jklmnop "), "this");
 201 %!assert (strtok ("this ","jklmnop "), "this");
 202 %!assert (strtok ("this is","jklmnop "), "this");
 203 %!assert (strtok (" this","jklmnop "), "this");
 204 %!assert (strtok (" this ","jklmnop "), "this");
 205 %!assert (strtok (" ","jklmnop "), ""(1:0));
 206
 207 %% Test 'bad' string orientations
 208 %!assert (strtok (" this ".'), "this".');   # delimiter at start and end
 209 %!assert (strtok (" this ".',"jkl "), "this".');
 210
 211 %% Test with TAB, LF, VT, FF, and CR
 212 %!test
 213 %! for ch = "\t\n\v\f\r"
 214 %!   [t, r] = strtok (cstrcat ("beg", ch, "end"));
 215 %!   assert (t, "beg");
 216 %!   assert (r, cstrcat (ch, "end"))
 217 %! endfor
 218
 219 %% Test input validation
 220 %!error strtok ()
 221 %!error strtok ("a", "b", "c")
 222 %!error <STR must be a string> strtok (1, "b")
 223 %!error <STR cannot be a 2-D> strtok (char ("hello", "world"), "l")
 224