1 ## Copyright (C) 2000-2012 Paul Kienzle
3 ## This file is part of Octave.
5 ## Octave is free software; you can redistribute it and/or modify it
6 ## under the terms of the GNU General Public License as published by
7 ## the Free Software Foundation; either version 3 of the License, or (at
8 ## your option) any later version.
10 ## Octave is distributed in the hope that it will be useful, but
11 ## WITHOUT ANY WARRANTY; without even the implied warranty of
12 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 ## General Public License for more details.
15 ## You should have received a copy of the GNU General Public License
16 ## along with Octave; see the file COPYING. If not, see
17 ## <http://www.gnu.org/licenses/>.
20 ## @deftypefn {Function File} {} speed (@var{f}, @var{init}, @var{max_n}, @var{f2}, @var{tol})
21 ## @deftypefnx {Function File} {[@var{order}, @var{n}, @var{T_f}, @var{T_f2}] =} speed (@dots{})
23 ## Determine the execution time of an expression (@var{f}) for various input
24 ## values (@var{n}). The @var{n} are log-spaced from 1 to @var{max_n}. For
25 ## each @var{n}, an initialization expression (@var{init}) is computed to
26 ## create any data needed for the test. If a second expression (@var{f2}) is
27 ## given then the execution times of the two expressions are compared. When
28 ## called without output arguments the results are printed to stdout and
29 ## displayed graphically.
33 ## The code expression to evaluate.
36 ## The maximum test length to run. The default value is 100. Alternatively,
37 ## use @code{[min_n, max_n]} or specify the @var{n} exactly with
38 ## @code{[n1, n2, @dots{}, nk]}.
41 ## Initialization expression for function argument values. Use @var{k}
42 ## for the test number and @var{n} for the size of the test. This should
43 ## compute values for all variables used by @var{f}. Note that @var{init} will
44 ## be evaluated first for @math{k = 0}, so things which are constant throughout
45 ## the test series can be computed once. The default value is
46 ## @code{@var{x} = randn (@var{n}, 1)}.
49 ## An alternative expression to evaluate, so that the speed of two
50 ## expressions can be directly compared. The default is @code{[]}.
53 ## Tolerance used to compare the results of expression @var{f} and expression
54 ## @var{f2}. If @var{tol} is positive, the tolerance is an absolute one.
55 ## If @var{tol} is negative, the tolerance is a relative one. The default is
56 ## @code{eps}. If @var{tol} is @code{Inf}, then no comparison will be made.
59 ## The time complexity of the expression @math{O(a*n^p)}. This
60 ## is a structure with fields @code{a} and @code{p}.
63 ## The values @var{n} for which the expression was calculated @strong{AND}
64 ## the execution time was greater than zero.
67 ## The nonzero execution times recorded for the expression @var{f} in seconds.
70 ## The nonzero execution times recorded for the expression @var{f2} in seconds.
71 ## If required, the mean time ratio is simply @code{mean (T_f ./ T_f2)}.
75 ## The slope of the execution time graph shows the approximate
76 ## power of the asymptotic running time @math{O(n^p)}. This
77 ## power is plotted for the region over which it is approximated
78 ## (the latter half of the graph). The estimated power is not
79 ## very accurate, but should be sufficient to determine the
80 ## general order of an algorithm. It should indicate if, for
81 ## example, the implementation is unexpectedly @math{O(n^2)}
82 ## rather than @math{O(n)} because it extends a vector each
83 ## time through the loop rather than pre-allocating storage.
84 ## In the current version of Octave, the following is not the
85 ## expected @math{O(n)}.
88 ## speed ("for i = 1:n, y@{i@} = x(i); endfor", "", [1000, 10000])
92 ## But it is if you preallocate the cell array @code{y}:
96 ## speed ("for i = 1:n, y@{i@} = x(i); endfor", ...
97 ## "x = rand (n, 1); y = cell (size (x));", [1000, 10000])
101 ## An attempt is made to approximate the cost of individual
102 ## operations, but it is wildly inaccurate. You can improve the
103 ## stability somewhat by doing more work for each @code{n}. For
107 ## speed ("airy(x)", "x = rand (n, 10)", [10000, 100000])
110 ## When comparing two different expressions (@var{f}, @var{f2}), the slope
111 ## of the line on the speedup ratio graph should be larger than 1 if the new
112 ## expression is faster. Better algorithms have a shallow slope. Generally,
113 ## vectorizing an algorithm will not change the slope of the execution
114 ## time graph, but will shift it relative to the original. For
119 ## speed ("sum (x)", "", [10000, 100000], ...
120 ## "v = 0; for i = 1:length (x), v += x(i); endfor")
124 ## The following is a more complex example. If there was an original version
125 ## of @code{xcorr} using for loops and a second version using an FFT, then
126 ## one could compare the run speed for various lags, or for a fixed lag with
127 ## varying vector lengths, as follows:
131 ## speed ("xcorr (x, n)", "x = rand (128, 1);", 100,
132 ## "xcorr_orig (x, n)", -100*eps)
133 ## speed ("xcorr (x, 15)", "x = rand (20+n, 1);", 100,
134 ## "xcorr_orig (x, 15)", -100*eps)
138 ## Assuming one of the two versions is in xcorr_orig, this
139 ## would compare their speed and their output values. Note that the
140 ## FFT version is not exact, so one must specify an acceptable tolerance on
141 ## the comparison @code{100*eps}. In this case, the comparison should be
142 ## computed relatively, as @code{abs ((@var{x} - @var{y}) ./ @var{y})} rather
143 ## than absolutely as @code{abs (@var{x} - @var{y})}.
145 ## Type @kbd{example ("speed")} to see some real examples or
146 ## @kbd{demo ("speed")} to run them.
149 ## FIXME: consider two dimensional speedup surfaces for functions like kron.
150 function [__order, __test_n, __tnew, __torig] = speed (__f1, __init, __max_n = 100, __f2 = "", __tol = eps)
152 if (nargin < 1 || nargin > 6)
156 if (nargin < 2 || isempty (__init))
157 __init = "x = randn (n, 1)";
160 if (isempty (__max_n))
166 ## Let user specify range of n.
167 if (isscalar (__max_n))
169 assert (__max_n > __min_n);
170 __test_n = logspace (0, log10 (__max_n), __numtests);
171 elseif (length (__max_n) == 2)
172 [__min_n, __max_n] = deal (__max_n(1), __max_n(2));
173 assert (__min_n >= 1);
174 assert (__max_n > __min_n);
175 __test_n = logspace (log10 (__min_n), log10 (__max_n), __numtests);
177 assert (all (__max_n > 0));
180 ## Force n to be an integer.
181 __test_n = unique (round (__test_n));
182 assert (__test_n >= 1);
184 __torig = __tnew = zeros (size (__test_n));
186 ## Print and plot the data if no output is requested.
187 do_display = (nargout == 0);
190 disp (cstrcat ("testing ", __f1, "\ninit: ", __init));
193 ## Add semicolon closure to all code fragments in case user has not done so.
194 __init = cstrcat (__init, ";");
195 __f1 = cstrcat (__f1, ";");
196 if (! isempty (__f2))
197 __f2 = cstrcat (__f2, ";");
200 ## Make sure the functions are freshly loaded by evaluating them at
201 ## test_n(1); first have to initialize the args though.
206 if (! isempty (__f2))
211 for k = 1:length (__test_n)
216 printf ("n%i = %i ", k, n);
220 eval (cstrcat ("__t = time();", __f1, "__v1=ans; __t = time()-__t;"));
222 eval (cstrcat ("__t2 = time();", __f1, "__t2 = time()-__t2;"));
223 eval (cstrcat ("__t3 = time();", __f1, "__t3 = time()-__t3;"));
224 __t = min ([__t, __t2, __t3]);
228 if (! isempty (__f2))
229 eval (cstrcat ("__t = time();", __f2, "__v2=ans; __t = time()-__t;"));
231 eval (cstrcat ("__t2 = time();", __f2, "__t2 = time()-__t2;"));
232 eval (cstrcat ("__t3 = time();", __f2, "__t3 = time()-__t3;"));
233 __t = min ([__t, __t2, __t3]);
237 assert (__v1, __v2, __tol);
243 ## Drop times of zero.
245 zidx = (__tnew < 100*eps);
249 zidx = (__tnew < 100*eps | __torig < 100*eps);
255 if (isempty (__test_n))
256 error (["speed: All running times were zero.\n",
257 "error: speed: Choose larger MAX_N or do more work per function evaluation"]);
260 ## Approximate time complexity and return it if requested.
261 tailidx = ceil (length (__test_n)/2):length (__test_n);
262 p = polyfit (log (__test_n(tailidx)), log (__tnew(tailidx)), 1);
265 __order.a = exp (p(2));
270 ## Strip semicolon added to code fragments before displaying
273 if (! isempty (__f2))
278 if (do_display && isempty (__f2))
280 loglog (__test_n, __tnew*1000, "*-g;execution time;");
281 xlabel ("test length");
282 ylabel ("best execution time (ms)");
283 title ({__f1, cstrcat("init: ", __init)});
288 semilogx (__test_n, __torig./__tnew,
289 cstrcat ("-*r;", strrep (__f1, ";", "."), " / ",
290 strrep (__f2, ";", "."), ";"),
291 __test_n, __tnew./__torig,
292 cstrcat ("-*g;", strrep (__f2, ";", "."), " / ",
293 strrep (__f1, ";", "."), ";"));
294 title ("Speedup Ratio");
295 xlabel ("test length");
296 ylabel ("speedup ratio");
299 loglog (__test_n, __tnew*1000,
300 cstrcat ("*-g;", strrep (__f1, ";", "."), ";"),
301 __test_n, __torig*1000,
302 cstrcat ("*-r;", strrep (__f2,";","."), ";"));
303 title ({"Execution Times", cstrcat("init: ", __init)});
304 xlabel ("test length");
305 ylabel ("best execution time (ms)");
307 ratio = mean (__torig ./ __tnew);
308 printf ("\n\nMean runtime ratio = %.3g for '%s' vs '%s'\n",
315 ## Plot time complexity approximation (using milliseconds).
316 figure; # Open second plot window
318 order = round (10*p(1))/10;
320 order = sprintf ("O(n^%g)", order);
324 v = polyval (p, log (__test_n(tailidx)));
326 loglog (__test_n(tailidx), exp(v)*1000, sprintf ("b;%s;", order));
327 title ({"Time Complexity", __f1});
328 xlabel ("test length");
330 ## Get base time to 1 digit of accuracy.
332 dt = floor (dt/10^floor(log10(dt)))*10^floor(log10(dt));
333 if (log10 (dt) >= -0.5)
334 time = sprintf ("%g s", dt);
335 elseif (log10 (dt) >= -3.5)
336 time = sprintf ("%g ms", dt*1e3);
337 elseif (log10 (dt) >= -6.5)
338 time = sprintf ("%g us", dt*1e6);
340 time = sprintf ("%g ns", dt*1e9);
343 ## Display nicely formatted complexity.
344 printf ("\nFor %s:\n", __f1);
345 printf (" asymptotic power: %s\n", order);
346 printf (" approximate time per operation: %s\n", time);
353 %% FIXME: Demos with declared functions do not work. See bug #31815.
354 %% A workaround has been hacked by not declaring the functions
355 %% but using eval to create them in the proper context.
356 %% Unfortunately, we can't remove them from the user's workspace
357 %% because of another bug (#34497).
%% Demo 1: growing the result vector on the fly vs. preallocating it.
359 %! fstr_build_orig = cstrcat (
360 %! "function x = build_orig (n)\n",
361 %! " ## extend the target vector on the fly\n",
362 %! " for i=0:n-1, x([1:100]+i*100) = 1:100; endfor\n",
364 %! fstr_build = cstrcat (
365 %! "function x = build (n)\n",
366 %! " ## preallocate the target vector\n",
367 %! " x = zeros (1, n*100);\n",
368 %! " for i=0:n-1, x([1:100]+i*100) = 1:100; endfor\n",
371 %! disp ("-----------------------");
372 %! disp (fstr_build_orig);
373 %! disp ("-----------------------");
374 %! disp (fstr_build);
375 %! disp ("-----------------------");
377 %! ## Eval the function strings to create them in the current context
378 %! eval (fstr_build_orig);
379 %! eval (fstr_build);
381 %! disp ("Preallocated vector test.\nThis takes a little while...");
382 %! speed("build (n)", "", 1000, "build_orig (n)");
383 %! clear -f build build_orig
384 %! disp ("Note how much faster it is to pre-allocate a vector.");
385 %! disp ("Notice the peak speedup ratio.");
%% Demo 2: eliminating the loop entirely with a vectorized build.
388 %! fstr_build_orig = cstrcat (
389 %! "function x = build_orig (n)\n",
390 %! " for i=0:n-1, x([1:100]+i*100) = 1:100; endfor\n",
392 %! fstr_build = cstrcat (
393 %! "function x = build (n)\n",
394 %! " idx = [1:100]';\n",
395 %! " x = idx(:,ones(1,n));\n",
396 %! " x = reshape (x, 1, n*100);\n",
399 %! disp ("-----------------------");
400 %! disp (fstr_build_orig);
401 %! disp ("-----------------------");
402 %! disp (fstr_build);
403 %! disp ("-----------------------");
405 %! ## Eval the function strings to create them in the current context
406 %! eval (fstr_build_orig);
407 %! eval (fstr_build);
409 %! disp ("Vectorized test.\nThis takes a little while...");
410 %! speed("build (n)", "", 1000, "build_orig (n)");
411 %! clear -f build build_orig
412 %! disp ("-----------------------");
413 %! disp ("This time, the for loop is done away with entirely.");
414 %! disp ("Notice how much bigger the speedup is than in example 1.");
%% Self-test: single-expression call returns the order struct and
%% size-consistent n / T_f1 / T_f2 vectors.
417 %! [order, n, T_f1, T_f2] = speed ("airy (x)", "x = rand (n, 10)", [100, 1000]);
418 %! assert (isstruct (order));
419 %! assert (size (order), [1, 1]);
420 %! assert (fieldnames (order), {"p"; "a"});
421 %! assert (isnumeric (n));
422 %! assert (length (n) > 10);
423 %! assert (isnumeric (T_f1));
424 %! assert (size (T_f1), size (n));
425 %! assert (isnumeric (T_f2));
426 %! assert (length (T_f2) > 10);
428 %% This test is known to fail on platforms with low-resolution timers, such as MinGW builds on Windows.
%% Self-test: two-expression comparison (vectorized sum vs. explicit loop).
430 %! [order, n, T_f1, T_f2] = speed ("sum (x)", "", [100, 1000], "v = 0; for i = 1:length (x), v += x(i); endfor");
431 %! assert (isstruct (order));
432 %! assert (size (order), [1, 1]);
433 %! assert (fieldnames (order), {"p"; "a"});
434 %! assert (isnumeric (n));
435 %! assert (length (n) > 10);
436 %! assert (isnumeric (T_f1));
437 %! assert (size (T_f1), size (n));
438 %! assert (isnumeric (T_f2));
439 %! assert (length (T_f2) > 10);
441 %% Test input validation
443 %!error speed (1, 2, 3, 4, 5, 6, 7);