1 ## Copyright (C) 2000-2012 Paul Kienzle
3 ## This file is part of Octave.
5 ## Octave is free software; you can redistribute it and/or modify it
6 ## under the terms of the GNU General Public License as published by
7 ## the Free Software Foundation; either version 3 of the License, or (at
8 ## your option) any later version.
10 ## Octave is distributed in the hope that it will be useful, but
11 ## WITHOUT ANY WARRANTY; without even the implied warranty of
12 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 ## General Public License for more details.
15 ## You should have received a copy of the GNU General Public License
16 ## along with Octave; see the file COPYING. If not, see
17 ## <http://www.gnu.org/licenses/>.
20 ## @deftypefn {Function File} {} speed (@var{f}, @var{init}, @var{max_n}, @var{f2}, @var{tol})
21 ## @deftypefnx {Function File} {[@var{order}, @var{n}, @var{T_f}, @var{T_f2}] =} speed (@dots{})
23 ## Determine the execution time of an expression (@var{f}) for various input
24 ## values (@var{n}). The @var{n} are log-spaced from 1 to @var{max_n}. For
25 ## each @var{n}, an initialization expression (@var{init}) is computed to
26 ## create any data needed for the test. If a second expression (@var{f2}) is
27 ## given then the execution times of the two expressions are compared. When
28 ## called without output arguments the results are printed to stdout and
29 ## displayed graphically.
33 ## The code expression to evaluate.
36 ## The maximum test length to run. The default value is 100. Alternatively,
37 ## use @code{[min_n, max_n]} or specify the @var{n} exactly with
38 ## @code{[n1, n2, @dots{}, nk]}.
41 ## Initialization expression for function argument values. Use @var{k}
42 ## for the test number and @var{n} for the size of the test. This should
43 ## compute values for all variables used by @var{f}. Note that @var{init} will
44 ## be evaluated first for @math{k = 0}, so things which are constant throughout
45 ## the test series can be computed once. The default value is
46 ## @code{@var{x} = randn (@var{n}, 1)}.
49 ## An alternative expression to evaluate, so that the speed of two
50 ## expressions can be directly compared. The default is @code{[]}.
53 ## Tolerance used to compare the results of expression @var{f} and expression
54 ## @var{f2}. If @var{tol} is positive, the tolerance is an absolute one.
55 ## If @var{tol} is negative, the tolerance is a relative one. The default is
56 ## @code{eps}. If @var{tol} is @code{Inf}, then no comparison will be made.
59 ## The time complexity of the expression @math{O(a*n^p)}. This
60 ## is a structure with fields @code{a} and @code{p}.
63 ## The values @var{n} for which the expression was calculated @strong{AND}
64 ## the execution time was greater than zero.
67 ## The nonzero execution times recorded for the expression @var{f} in seconds.
70 ## The nonzero execution times recorded for the expression @var{f2} in seconds.
71 ## If required, the mean time ratio is simply @code{mean (T_f ./ T_f2)}.
75 ## The slope of the execution time graph shows the approximate
76 ## power of the asymptotic running time @math{O(n^p)}. This
77 ## power is plotted for the region over which it is approximated
78 ## (the latter half of the graph). The estimated power is not
79 ## very accurate, but should be sufficient to determine the
80 ## general order of an algorithm. It should indicate if, for
81 ## example, the implementation is unexpectedly @math{O(n^2)}
82 ## rather than @math{O(n)} because it extends a vector each
83 ## time through the loop rather than pre-allocating storage.
84 ## In the current version of Octave, the following is not the
85 ## expected @math{O(n)}.
88 ## speed ("for i = 1:n, y@{i@} = x(i); endfor", "", [1000, 10000])
92 ## But it is if you preallocate the cell array @code{y}:
96 ## speed ("for i = 1:n, y@{i@} = x(i); endfor", ...
97 ## "x = rand (n, 1); y = cell (size (x));", [1000, 10000])
101 ## An attempt is made to approximate the cost of individual
102 ## operations, but it is wildly inaccurate. You can improve the
103 ## stability somewhat by doing more work for each @code{n}. For
107 ## speed ("airy(x)", "x = rand (n, 10)", [10000, 100000])
110 ## When comparing two different expressions (@var{f}, @var{f2}), the slope
111 ## of the line on the speedup ratio graph should be larger than 1 if the new
112 ## expression is faster. Better algorithms have a shallow slope. Generally,
113 ## vectorizing an algorithm will not change the slope of the execution
114 ## time graph, but will shift it relative to the original. For
119 ## speed ("sum (x)", "", [10000, 100000], ...
120 ## "v = 0; for i = 1:length (x), v += x(i); endfor")
124 ## The following is a more complex example. If there was an original version
125 ## of @code{xcorr} using for loops and a second version using an FFT, then
126 ## one could compare the run speed for various lags, or for a fixed lag with
127 ## varying vector lengths, as follows:
131 ## speed ("xcorr (x, n)", "x = rand (128, 1);", 100,
132 ## "xcorr_orig (x, n)", -100*eps)
133 ## speed ("xcorr (x, 15)", "x = rand (20+n, 1);", 100,
134 ## "xcorr_orig (x, 15)", -100*eps)
138 ## Assuming one of the two versions is in xcorr_orig, this
139 ## would compare their speed and their output values. Note that the
140 ## FFT version is not exact, so one must specify an acceptable tolerance on
141 ## the comparison @code{100*eps}. In this case, the comparison should be
142 ## computed relatively, as @code{abs ((@var{x} - @var{y}) ./ @var{y})} rather
143 ## than absolutely as @code{abs (@var{x} - @var{y})}.
145 ## Type @kbd{example ("speed")} to see some real examples or
146 ## @kbd{demo ("speed")} to run them.
149 ## FIXME: consider two dimensional speedup surfaces for functions like kron.
150 function [__order, __test_n, __tnew, __torig] = speed (__f1, __init, __max_n = 100, __f2 = "", __tol = eps)
152 if (nargin < 1 || nargin > 6)
156 if (nargin < 2 || isempty (__init))
157 __init = "x = randn (n, 1)";
160 if (isempty (__max_n))
166 ## Let user specify range of n.
167 if (isscalar (__max_n))
169 assert (__max_n > __min_n);
170 __test_n = logspace (0, log10 (__max_n), __numtests);
171 elseif (length (__max_n) == 2)
172 [__min_n, __max_n] = deal (__max_n(1), __max_n(2));
173 assert (__min_n >= 1);
174 assert (__max_n > __min_n);
175 __test_n = logspace (log10 (__min_n), log10 (__max_n), __numtests);
177 assert (all (__max_n > 0));
180 ## Force n to be an integer.
181 __test_n = unique (round (__test_n));
182 assert (__test_n >= 1);
184 __torig = __tnew = zeros (size (__test_n));
186 ## Print and plot the data if no output is requested.
187 do_display = (nargout == 0);
190 disp (cstrcat ("testing ", __f1, "\ninit: ", __init));
193 ## Add semicolon closure to all code fragments in case user has not done so.
194 __init = cstrcat (__init, ";");
195 __f1 = cstrcat (__f1, ";");
196 if (! isempty (__f2))
197 __f2 = cstrcat (__f2, ";");
200 ## Make sure the functions are freshly loaded by evaluating them at
201 ## test_n(1); first have to initialize the args though.
206 if (! isempty (__f2))
211 for k = 1:length (__test_n)
216 printf ("n%i = %i ", k, n);
220 eval (cstrcat ("__t = time();", __f1, "__v1=ans; __t = time()-__t;"));
222 eval (cstrcat ("__t2 = time();", __f1, "__t2 = time()-__t2;"));
223 eval (cstrcat ("__t3 = time();", __f1, "__t3 = time()-__t3;"));
224 __t = min ([__t, __t2, __t3]);
228 if (! isempty (__f2))
229 eval (cstrcat ("__t = time();", __f2, "__v2=ans; __t = time()-__t;"));
231 eval (cstrcat ("__t2 = time();", __f2, "__t2 = time()-__t2;"));
232 eval (cstrcat ("__t3 = time();", __f2, "__t3 = time()-__t3;"));
233 __t = min ([__t, __t2, __t3]);
237 assert (__v1, __v2, __tol);
243 ## Drop times of zero.
245 zidx = (__tnew < 100*eps);
249 zidx = (__tnew < 100*eps | __torig < 100*eps);
255 if (isempty (__test_n))
256 error (["speed: All running times were zero.\n",
257 "error: speed: Choose larger MAX_N or do more work per function evaluation"]);
260 ## Approximate time complexity and return it if requested.
261 tailidx = ceil (length (__test_n)/2):length (__test_n);
262 p = polyfit (log (__test_n(tailidx)), log (__tnew(tailidx)), 1);
265 __order.a = exp (p(2));
270 ## Strip semicolon added to code fragments before displaying
273 if (! isempty (__f2))
278 if (do_display && isempty (__f2))
280 loglog (__test_n, __tnew*1000, "*-g;execution time;");
281 xlabel ("test length");
282 ylabel ("best execution time (ms)");
283 title ({__f1, cstrcat("init: ", __init)});
288 semilogx (__test_n, __torig./__tnew,
289 cstrcat ("-*r;", strrep (__f1, ";", "."), " / ",
290 strrep (__f2, ";", "."), ";"),
291 __test_n, __tnew./__torig,
292 cstrcat ("-*g;", strrep (__f2, ";", "."), " / ",
293 strrep (__f1, ";", "."), ";"));
294 title ("Speedup Ratio");
295 xlabel ("test length");
296 ylabel ("speedup ratio");
299 loglog (__test_n, __tnew*1000,
300 cstrcat ("*-g;", strrep (__f1, ";", "."), ";"),
301 __test_n, __torig*1000,
302 cstrcat ("*-r;", strrep (__f2,";","."), ";"));
303 title ({"Execution Times", cstrcat("init: ", __init)});
304 xlabel ("test length");
305 ylabel ("best execution time (ms)");
307 ratio = mean (__torig ./ __tnew);
308 printf ("\n\nMean runtime ratio = %.3g for '%s' vs '%s'\n",
315 ## Plot time complexity approximation (using milliseconds).
316 figure; # Open second plot window
318 order = round (10*p(1))/10;
320 order = sprintf ("O(n^%g)", order);
324 v = polyval (p, log (__test_n(tailidx)));
326 loglog (__test_n(tailidx), exp(v)*1000, sprintf ("b;%s;", order));
327 title ({"Time Complexity", __f1});
328 xlabel ("test length");
330 ## Get base time to 1 digit of accuracy.
332 dt = floor (dt/10^floor(log10(dt)))*10^floor(log10(dt));
333 if (log10 (dt) >= -0.5)
334 time = sprintf ("%g s", dt);
335 elseif (log10 (dt) >= -3.5)
336 time = sprintf ("%g ms", dt*1e3);
337 elseif (log10 (dt) >= -6.5)
338 time = sprintf ("%g us", dt*1e6);
340 time = sprintf ("%g ns", dt*1e9);
343 ## Display nicely formatted complexity.
344 printf ("\nFor %s:\n", __f1);
345 printf (" asymptotic power: %s\n", order);
346 printf (" approximate time per operation: %s\n", time);
353 %% FIXME: Demos with declared functions do not work. See bug #31815.
354 %% A workaround has been hacked by not declaring the functions
355 %% but using eval to create them in the proper context.
356 %% Unfortunately, we can't remove them from the user's workspace
357 %% because of another bug (#34497).
%% Demo 1: growing the result vector on the fly vs. preallocating it.
359 %! fstr_build_orig = cstrcat (
360 %! "function x = build_orig (n)\n",
361 %! " ## extend the target vector on the fly\n",
362 %! " for i=0:n-1, x([1:100]+i*100) = 1:100; endfor\n",
364 %! fstr_build = cstrcat (
365 %! "function x = build (n)\n",
366 %! " ## preallocate the target vector\n",
367 %! " x = zeros (1, n*100);\n",
368 %! " for i=0:n-1, x([1:100]+i*100) = 1:100; endfor\n",
371 %! disp ("-----------------------");
372 %! disp (fstr_build_orig);
373 %! disp ("-----------------------");
374 %! disp (fstr_build);
375 %! disp ("-----------------------");
377 %! ## Eval the function strings to create them in the current context
378 %! eval (fstr_build_orig);
379 %! eval (fstr_build);
381 %! disp ("Preallocated vector test.\nThis takes a little while...");
382 %! speed("build (n)", "", 1000, "build_orig (n)");
383 %! clear -f build build_orig
384 %! disp ("Note how much faster it is to pre-allocate a vector.");
385 %! disp ("Notice the peak speedup ratio.");
%% Demo 2: eliminating the loop entirely with a vectorized build.
388 %! fstr_build_orig = cstrcat (
389 %! "function x = build_orig (n)\n",
390 %! " for i=0:n-1, x([1:100]+i*100) = 1:100; endfor\n",
392 %! fstr_build = cstrcat (
393 %! "function x = build (n)\n",
394 %! " idx = [1:100]';\n",
395 %! " x = idx(:,ones(1,n));\n",
396 %! " x = reshape (x, 1, n*100);\n",
399 %! disp ("-----------------------");
400 %! disp (fstr_build_orig);
401 %! disp ("-----------------------");
402 %! disp (fstr_build);
403 %! disp ("-----------------------");
405 %! ## Eval the function strings to create them in the current context
406 %! eval (fstr_build_orig);
407 %! eval (fstr_build);
409 %! disp ("Vectorized test.\nThis takes a little while...");
410 %! speed("build (n)", "", 1000, "build_orig (n)");
411 %! clear -f build build_orig
412 %! disp ("-----------------------");
413 %! disp ("This time, the for loop is done away with entirely.");
414 %! disp ("Notice how much bigger the speedup is than in example 1.");
%% Self-test: single-expression call returns the order struct and
%% size-consistent n / T_f1 / T_f2 vectors.
417 %! [order, n, T_f1, T_f2] = speed ("airy (x)", "x = rand (n, 10)", [100, 1000]);
418 %! assert (isstruct (order));
419 %! assert (size (order), [1, 1]);
420 %! assert (fieldnames (order), {"p"; "a"});
421 %! assert (isnumeric (n));
422 %! assert (length (n) > 10);
423 %! assert (isnumeric (T_f1));
424 %! assert (size (T_f1), size (n));
425 %! assert (isnumeric (T_f2));
426 %! assert (length (T_f2) > 10);
428 %% This test is known to fail on platforms with low-resolution timers, such as MinGW builds on Windows.
%% Self-test: two-expression comparison (vectorized sum vs. explicit loop).
430 %! [order, n, T_f1, T_f2] = speed ("sum (x)", "", [100, 1000], "v = 0; for i = 1:length (x), v += x(i); endfor");
431 %! assert (isstruct (order));
432 %! assert (size (order), [1, 1]);
433 %! assert (fieldnames (order), {"p"; "a"});
434 %! assert (isnumeric (n));
435 %! assert (length (n) > 10);
436 %! assert (isnumeric (T_f1));
437 %! assert (size (T_f1), size (n));
438 %! assert (isnumeric (T_f2));
439 %! assert (length (T_f2) > 10);
441 %% Test input validation
443 %!error speed (1, 2, 3, 4, 5, 6, 7);