1 ## Copyright (C) 2010-2012 Ben Abbott <bpabbott@mac.com>
3 ## This file is part of Octave.
5 ## Octave is free software; you can redistribute it and/or modify it
6 ## under the terms of the GNU General Public License as published by
7 ## the Free Software Foundation; either version 3 of the License, or (at
8 ## your option) any later version.
10 ## Octave is distributed in the hope that it will be useful, but
11 ## WITHOUT ANY WARRANTY; without even the implied warranty of
12 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 ## General Public License for more details.
15 ## You should have received a copy of the GNU General Public License
16 ## along with Octave; see the file COPYING. If not, see
17 ## <http://www.gnu.org/licenses/>.
20 ## @deftypefn {Function File} {@var{C} =} textscan (@var{fid}, @var{format})
21 ## @deftypefnx {Function File} {@var{C} =} textscan (@var{fid}, @var{format}, @var{n})
22 ## @deftypefnx {Function File} {@var{C} =} textscan (@var{fid}, @var{format}, @var{param}, @var{value}, @dots{})
23 ## @deftypefnx {Function File} {@var{C} =} textscan (@var{fid}, @var{format}, @var{n}, @var{param}, @var{value}, @dots{})
24 ## @deftypefnx {Function File} {@var{C} =} textscan (@var{str}, @dots{})
25 ## @deftypefnx {Function File} {[@var{C}, @var{position}] =} textscan (@var{fid}, @dots{})
26 ## Read data from a text file or string.
## The file associated with @var{fid} is read and parsed according to
## @var{format}.  The function behaves like @code{strread} except that it
## can also read its data from an open file rather than only from a string.
## See the documentation of @code{strread} for details.
33 ## In addition to the options supported by
34 ## @code{strread}, this function supports a few more:
37 ## @item "collectoutput":
38 ## A value of 1 or true instructs textscan to concatenate consecutive columns
39 ## of the same class in the output cell array. A value of 0 or false (default)
40 ## leaves output in distinct columns.
43 ## Specify "\r", "\n" or "\r\n" (for CR, LF, or CRLF). If no value is given,
44 ## it will be inferred from the file. If set to "" (empty string) EOLs are
45 ## ignored as delimiters and added to whitespace.
47 ## @item "headerlines":
48 ## The first @var{value} number of lines of @var{fid} are skipped.
50 ## @item "returnonerror":
51 ## If set to numerical 1 or true (default), return normally when read errors
52 ## have been encountered. If set to 0 or false, return an error and no data.
## The optional input @var{n} specifies the number of times to use
## @var{format} when parsing, i.e., the format repeat count.
58 ## The output @var{C} is a cell array whose length is given by the number
59 ## of format specifiers.
61 ## The second output, @var{position}, provides the position, in characters,
62 ## from the beginning of the file.
64 ## @seealso{dlmread, fscanf, load, strread, textread}
function [C, position] = textscan (fid, format = "%f", varargin)

  ## Front end for strread(): validate the inputs, normalize the option
  ## list so that Matlab-compatible defaults are applied, fetch the text
  ## (either the character input itself or the contents of the open file
  ## FID) and let strread() do the actual parsing.
  ##
  ## FID      file id (positive scalar double) or a character string.
  ## FORMAT   conversion format string; defaults to "%f".
  ## VARARGIN optional repeat count N followed by PARAM/VALUE pairs.
  ## C        1-by-NUM_FIELDS cell array of parsed columns ([] if no data).
  ## POSITION file position after reading (file input only).

  ## Check input
  if (nargin < 1)
    print_usage ();
  endif

  if (isempty (format))
    format = "%f";
  endif

  is_file_id = isa (fid, "double") && isscalar (fid) && fid > 0;
  if (! (is_file_id || ischar (fid)))
    error ("textscan: first argument must be a file id or character string");
  endif

  if (! ischar (format))
    error ("textscan: FORMAT must be a string");
  endif

  ## A leading numeric option is the repeat count.  It is left in ARGS
  ## because strread() accepts it in the same position.
  args = varargin;
  if (nargin > 2 && isnumeric (args{1}))
    nlines = args{1};
  else
    nlines = Inf;
  endif

  if (! any (strcmpi (args, "emptyvalue")))
    ## Matlab returns NaNs for missing values
    args(end+1:end+2) = {'emptyvalue', NaN};
  endif

  ## Check default parameter values that differ for strread & textread

  ipos = find (strcmpi (args, "whitespace"));
  if (isempty (ipos))
    ## Matlab default whitespace = " \b\t"
    args(end+1:end+2) = {'whitespace', " \b\t"};
  else
    ## Check if there's at least one string format specifier (%s, %*s, %5s)
    has_str_fmt = ! isempty (regexp (format, '%[*]?\d*s', "once"));
    ## If there is a string format AND the whitespace value is empty,
    ## don't add a space (char(32)) to whitespace.  Otherwise make sure
    ## the user-supplied whitespace set also contains a space.
    if (! (isempty (args{ipos+1}) && has_str_fmt))
      args{ipos+1} = unique ([" ", args{ipos+1}]);
    endif
  endif

  if (! any (strcmpi (args, "delimiter")))
    ## Matlab says default delimiter = whitespace.
    ## strread() will pick this up further
    args(end+1:end+2) = {'delimiter', ""};
  endif

  collop = false;
  ipos = find (strcmpi (args, "collectoutput"));
  if (! isempty (ipos))
    ## Search & concatenate consecutive columns of same class requested
    if (isscalar (args{ipos+1})
        && (islogical (args{ipos+1}) || isnumeric (args{ipos+1})))
      collop = args{ipos+1};
    else
      warning ("textscan: illegal value for CollectOutput parameter - ignored");
    endif
    ## Remove argument before call to strread() below
    args(ipos:ipos+1) = [];
  endif

  if (any (strcmpi (args, "returnonerror")))
    ## Because of the way strread() reads data (columnwise) this parameter
    ## can't be neatly implemented.  strread() will pick it up anyway
    warning ('textscan: ReturnOnError is not fully implemented');
  else
    ## Set default value (=true)
    args(end+1:end+2) = {"returnonerror", 1};
  endif

  if (ischar (fid))
    ## Read from a text string
    if (nargout == 2)
      error ("textscan: cannot provide position information for character input");
    endif
    str = fid;
  else
    ## Skip header lines if requested
    headerlines = find (strcmpi (args, "headerlines"), 1);
    ## Beware of zero valued headerline, fskipl would skip to EOF
    if (! isempty (headerlines) && (args{headerlines + 1} > 0))
      ## Index ARGS, not VARARGIN: the positions no longer agree once
      ## options such as "collectoutput" have been removed from ARGS.
      fskipl (fid, args{headerlines + 1});
      args(headerlines:headerlines+1) = [];
    endif
    if (isfinite (nlines) && (nlines >= 0))
      ## Read at most NLINES lines.  Collect them in a cell array and
      ## concatenate once; plain [] concatenation keeps every character,
      ## including the trailing EOL of each line.
      pieces = {};
      n = 0;
      while (n < nlines)
        tmp_str = fgets (fid);
        if (! ischar (tmp_str))
          break;
        endif
        pieces{end+1} = tmp_str;
        n++;
      endwhile
      if (isempty (pieces))
        str = "";
      else
        str = [pieces{:}];
      endif
    else
      str = fread (fid, "char=>char").';
    endif
  endif

  ## Check for empty result
  if (isempty (str))
    warning ("textscan: no data read");
    C = [];
    if (nargout == 2)
      ## Only reachable for file input; keep the second output defined
      position = ftell (fid);
    endif
    return;
  endif

  ## Check value of 'endofline'.  String or file doesn't seem to matter
  endofline = find (strcmpi (args, "endofline"), 1);
  if (! isempty (endofline))
    if (ischar (args{endofline + 1}))
      eol_char = args{endofline + 1};
      if (! any (strcmp (eol_char, {"", "\n", "\r", "\r\n"})))
        error ("textscan: illegal EndOfLine character value specified");
      endif
    else
      error ("textscan: character value required for EndOfLine");
    endif
  else
    ## Determine EOL from file.  Search for EOL candidates in first 3000 chars
    eol_srch_len = min (length (str), 3000);
    head = str(1 : eol_srch_len);
    ## First try DOS (CRLF)
    if (! isempty (strfind (head, "\r\n")))
      eol_char = "\r\n";
    ## Perhaps old Macintosh? (CR)
    elseif (! isempty (strfind (head, "\r")))
      eol_char = "\r";
    ## Otherwise, use plain UNIX (LF)
    else
      eol_char = "\n";
    endif
    ## Set up the default endofline param value
    args(end+1:end+2) = {'endofline', eol_char};
  endif

  ## Determine the number of data fields
  num_fields = numel (strfind (format, "%")) - numel (strfind (format, "%*"));

  ## Strip trailing EOL to avoid returning stray missing values (cf. strread).
  ## Guard against input shorter than the EOL sequence.
  n_eol = length (eol_char);
  if (n_eol > 0 && length (str) >= n_eol
      && strcmp (str(end-n_eol+1 : end), eol_char))
    str(end-n_eol+1 : end) = "";
  endif

  ## Call strread to make it do the real work
  C = cell (1, num_fields);
  [C{:}] = strread (str, format, args{:});

  ## If requested, collect output columns of same class
  if (collop)
    C = colloutp (C);
  endif

  if (nargout == 2)
    position = ftell (fid);
  endif

endfunction
## Merge each run of adjacent same-class columns of the cell row C into a
## single cell holding the horizontally concatenated columns.
function C = colloutp (C)

  ## Scan from the last column towards the first so that deleting merged
  ## columns never shifts the indices that still have to be visited.
  hi = numel (C);
  while (hi > 1)
    run_class = class (C{hi});

    ## Extend the run leftwards while the neighbour has the same class
    lo = hi;
    while (lo > 1 && strcmp (run_class, class (C{lo - 1})))
      lo--;
    endwhile

    if (lo == hi)
      ## Left neighbour is of another class; continue from there
      hi--;
      continue;
    endif

    ## Fold the whole run into its leftmost column ...
    C{lo} = [C{lo : hi}];
    ## ... drop the now redundant columns and resume left of the run
    C(lo+1 : hi) = [];
    hi = lo - 1;
  endwhile

endfunction
262 %! str = "1, 2, 3, 4\n 5, , , 8\n 9, 10, 11, 12";
263 %! fmtstr = "%f %d %f %s";
264 %! c = textscan (str, fmtstr, 2, "delimiter", ",", "emptyvalue", -Inf);
265 %! assert (isequal (c{1}, [1;5]));
266 %! assert (length (c{1}), 2);
267 %! assert (iscellstr (c{4}));
268 %! assert (isequal (c{3}, [3; -Inf]));
273 %! str = sprintf ("%g miles/hr = %g kilometers/hr\n", b);
274 %! fmt = "%f miles/hr = %f kilometers/hr";
275 %! c = textscan (str, fmt);
276 %! assert (b(1,:)', c{1}, 1e-5);
277 %! assert (b(2,:)', c{2}, 1e-5);
280 #%! str = "13, 72, NA, str1, 25\r\n// Middle line\r\n36, na, 05, str3, 6";
281 #%! a = textscan(str, '%d %n %f %s %n', 'delimiter', ',','treatAsEmpty', {'NA', 'na'},'commentStyle', '//');
282 #%! assert (a{1}, int32([13; 36]));
283 #%! assert (a{2}, [72; NaN]);
284 #%! assert (a{3}, [NaN; 5]);
285 #%! assert (a{4}, {"str1"; "str3"});
286 #%! assert (a{5}, [25; 6]);
289 %! str = "Km:10 = hhhBjjj miles16hour\r\n";
290 %! str = [str "Km:15 = hhhJjjj miles241hour\r\n"];
291 %! str = [str "Km:2 = hhhRjjj miles3hour\r\n"];
292 %! str = [str "Km:25 = hhhZ\r\n"];
293 %! fmt = "Km:%d = hhh%1sjjj miles%dhour";
294 %! a = textscan (str, fmt, 'delimiter', ' ');
295 %! assert (a{1}', int32([10 15 2 25]));
296 %! assert (a{2}', {'B' 'J' 'R' 'Z'});
297 %! assert (a{3}', int32([16 241 3 0]));
299 %% Test with default endofline parameter
301 %! c = textscan ("L1\nL2", "%s");
302 %! assert (c{:}, {"L1"; "L2"});
304 %% Test with endofline parameter set to '' (empty) - newline should be in word
306 %! c = textscan ("L1\nL2", "%s", 'endofline', '');
307 %! assert (int8(c{:}{:}), int8([ 76, 49, 10, 76, 50 ]));
310 %! # No delimiters at all besides EOL. Skip fields, even empty fields
311 %! str = "Text1Text2Text\nTextText4Text\nText57Text";
312 %! c = textscan (str, "Text%*dText%dText");
313 %! assert (c{1}, int32 ([2; 4; 0]));
316 %% CollectOutput test
318 %! b = [b; 8*b/5; 8*b*1000/5];
319 %! str = sprintf ("%g miles/hr = %g (%g) kilometers (meters)/hr\n", b);
320 %! fmt = "%f miles%s %s %f (%f) kilometers %*s";
321 %! c = textscan (str, fmt, 'collectoutput', 1);
322 %! assert (size(c{3}), [10, 2]);
323 %! assert (size(c{2}), [10, 2]);
325 %% Test input validation
327 %!error textscan (single (4))
328 %!error textscan ({4})
329 %!error <must be a string> textscan ("Hello World", 2)
330 %!error <cannot provide position information> [C, pos] = textscan ("Hello World")
331 %!error <character value required> textscan ("Hello World", '%s', 'EndOfLine', 3)