octave_packages/m/optimization/fminunc.m

   1 ## Copyright (C) 2008-2012 VZLU Prague, a.s.
   2 ##
   3 ## This file is part of Octave.
   4 ##
   5 ## Octave is free software; you can redistribute it and/or modify it
   6 ## under the terms of the GNU General Public License as published by
   7 ## the Free Software Foundation; either version 3 of the License, or (at
   8 ## your option) any later version.
   9 ##
  10 ## Octave is distributed in the hope that it will be useful, but
  11 ## WITHOUT ANY WARRANTY; without even the implied warranty of
  12 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 ## General Public License for more details.
  14 ##
  15 ## You should have received a copy of the GNU General Public License
  16 ## along with Octave; see the file COPYING.  If not, see
  17 ## <http://www.gnu.org/licenses/>.
  18 ##
  19 ## Author: Jaroslav Hajek <highegg@gmail.com>
  20
  21 ## -*- texinfo -*-
  22 ## @deftypefn  {Function File} {} fminunc (@var{fcn}, @var{x0})
  23 ## @deftypefnx {Function File} {} fminunc (@var{fcn}, @var{x0}, @var{options})
  24 ## @deftypefnx {Function File} {[@var{x}, @var{fvec}, @var{info}, @var{output}, @var{grad}, @var{hess}] =} fminunc (@var{fcn}, @dots{})
  25 ## Solve an unconstrained optimization problem defined by the function
  26 ## @var{fcn}.
  27 ## @var{fcn} should accepts a vector (array) defining the unknown variables,
  28 ## and return the objective function value, optionally with gradient.
  29 ## In other words, this function attempts to determine a vector @var{x} such
  30 ## that @code{@var{fcn} (@var{x})} is a local minimum.
  31 ## @var{x0} determines a starting guess.  The shape of @var{x0} is preserved
  32 ## in all calls to @var{fcn}, but otherwise is treated as a column vector.
  33 ## @var{options} is a structure specifying additional options.
  34 ## Currently, @code{fminunc} recognizes these options:
  35 ## @code{"FunValCheck"}, @code{"OutputFcn"}, @code{"TolX"},
  36 ## @code{"TolFun"}, @code{"MaxIter"}, @code{"MaxFunEvals"},
  37 ## @code{"GradObj"}, @code{"FinDiffType"},
  38 ## @code{"TypicalX"}, @code{"AutoScaling"}.
  39 ##
  40 ## If @code{"GradObj"} is @code{"on"}, it specifies that @var{fcn},
  41 ## called with 2 output arguments, also returns the Jacobian matrix
  42 ## of right-hand sides at the requested point.  @code{"TolX"} specifies
  43 ## the termination tolerance in the unknown variables, while
  44 ## @code{"TolFun"} is a tolerance for equations.  Default is @code{1e-7}
  45 ## for both @code{"TolX"} and @code{"TolFun"}.
  46 ##
  47 ## For description of the other options, see @code{optimset}.
  48 ##
  49 ## On return, @var{fval} contains the value of the function @var{fcn}
  50 ## evaluated at @var{x}, and @var{info} may be one of the following values:
  51 ##
  52 ## @table @asis
  53 ## @item 1
  54 ## Converged to a solution point.  Relative gradient error is less than
  55 ## specified
  56 ## by TolFun.
  57 ##
  58 ## @item 2
  59 ## Last relative step size was less that TolX.
  60 ##
  61 ## @item 3
  62 ## Last relative decrease in function value was less than TolF.
  63 ##
  64 ## @item 0
  65 ## Iteration limit exceeded.
  66 ##
  67 ## @item -3
  68 ## The trust region radius became excessively small.
  69 ## @end table
  70 ##
  71 ## Optionally, fminunc can also yield a structure with convergence statistics
  72 ## (@var{output}), the output gradient (@var{grad}) and approximate Hessian
  73 ## (@var{hess}).
  74 ##
  75 ## Note: If you only have a single nonlinear equation of one variable, using
  76 ## @code{fminbnd} is usually a much better idea.
  77 ## @seealso{fminbnd, optimset}
  78 ## @end deftypefn
  79
  80 ## PKG_ADD: ## Discard result to avoid polluting workspace with ans at startup.
  81 ## PKG_ADD: [~] = __all_opts__ ("fminunc");
  82
  83 function [x, fval, info, output, grad, hess] = fminunc (fcn, x0, options = struct ())
  84
  85   ## Get default options if requested.
  86   if (nargin == 1 && ischar (fcn) && strcmp (fcn, 'defaults'))
  87     x = optimset ("MaxIter", 400, "MaxFunEvals", Inf, \
  88     "GradObj", "off", "TolX", 1e-7, "TolFun", 1e-7,
  89     "OutputFcn", [], "FunValCheck", "off",
  90     "FinDiffType", "central",
  91     "TypicalX", [], "AutoScaling", "off");
  92     return;
  93   endif
  94
  95   if (nargin < 2 || nargin > 3 || ! ismatrix (x0))
  96     print_usage ();
  97   endif
  98
  99   if (ischar (fcn))
 100     fcn = str2func (fcn, "global");
 101   endif
 102
 103   xsiz = size (x0);
 104   n = numel (x0);
 105
 106   has_grad = strcmpi (optimget (options, "GradObj", "off"), "on");
 107   cdif = strcmpi (optimget (options, "FinDiffType", "central"), "central");
 108   maxiter = optimget (options, "MaxIter", 400);
 109   maxfev = optimget (options, "MaxFunEvals", Inf);
 110   outfcn = optimget (options, "OutputFcn");
 111
 112   ## Get scaling matrix using the TypicalX option. If set to "auto", the
 113   ## scaling matrix is estimated using the jacobian.
 114   typicalx = optimget (options, "TypicalX");
 115   if (isempty (typicalx))
 116     typicalx = ones (n, 1);
 117   endif
 118   autoscale = strcmpi (optimget (options, "AutoScaling", "off"), "on");
 119   if (! autoscale)
 120     dg = 1 ./ typicalx;
 121   endif
 122
 123   funvalchk = strcmpi (optimget (options, "FunValCheck", "off"), "on");
 124
 125   if (funvalchk)
 126     ## Replace fcn with a guarded version.
 127     fcn = @(x) guarded_eval (fcn, x);
 128   endif
 129
 130   ## These defaults are rather stringent. I think that normally, user
 131   ## prefers accuracy to performance.
 132
 133   macheps = eps (class (x0));
 134
 135   tolx = optimget (options, "TolX", 1e-7);
 136   tolf = optimget (options, "TolFun", 1e-7);
 137
 138   factor = 0.1;
 139   ## FIXME: TypicalX corresponds to user scaling (???)
 140   autodg = true;
 141
 142   niter = 1;
 143   nfev = 0;
 144
 145   x = x0(:);
 146   info = 0;
 147
 148   ## Initial evaluation.
 149   fval = fcn (reshape (x, xsiz));
 150   n = length (x);
 151
 152   if (! isempty (outfcn))
 153     optimvalues.iter = niter;
 154     optimvalues.funccount = nfev;
 155     optimvalues.fval = fval;
 156     optimvalues.searchdirection = zeros (n, 1);
 157     state = 'init';
 158     stop = outfcn (x, optimvalues, state);
 159     if (stop)
 160       info = -1;
 161       break;
 162     endif
 163   endif
 164
 165   nsuciter = 0;
 166   lastratio = 0;
 167
 168   grad = [];
 169
 170   ## Outer loop.
 171   while (niter < maxiter && nfev < maxfev && ! info)
 172
 173     grad0 = grad;
 174
 175     ## Calculate function value and gradient (possibly via FD).
 176     if (has_grad)
 177       [fval, grad] = fcn (reshape (x, xsiz));
 178       grad = grad(:);
 179       nfev ++;
 180     else
 181       grad = __fdjac__ (fcn, reshape (x, xsiz), fval, typicalx, cdif)(:);
 182       nfev += (1 + cdif) * length (x);
 183     endif
 184
 185     if (niter == 1)
 186       ## Initialize by identity matrix.
 187       hesr = eye (n);
 188     else
 189       ## Use the damped BFGS formula.
 190       y = grad - grad0;
 191       sBs = sumsq (w);
 192       Bs = hesr'*w;
 193       sy = y'*s;
 194       theta = 0.8 / max (1 - sy / sBs, 0.8);
 195       r = theta * y + (1-theta) * Bs;
 196       hesr = cholupdate (hesr, r / sqrt (s'*r), "+");
 197       [hesr, info] = cholupdate (hesr, Bs / sqrt (sBs), "-");
 198       if (info)
 199         hesr = eye (n);
 200       endif
 201     endif
 202
 203     if (autoscale)
 204       ## Second derivatives approximate the hessian.
 205       d2f = norm (hesr, 'columns').';
 206       if (niter == 1)
 207         dg = d2f;
 208       else
 209         ## FIXME: maybe fixed lower and upper bounds?
 210         dg = max (0.1*dg, d2f);
 211       endif
 212     endif
 213
 214     if (niter == 1)
 215       xn = norm (dg .* x);
 216       ## FIXME: something better?
 217       delta = factor * max (xn, 1);
 218     endif
 219
 220     ## FIXME -- why tolf*n*xn? If abs (e) ~ abs(x) * eps is a vector
 221     ## of perturbations of x, then norm (hesr*e) <= eps*xn, i.e. by
 222     ## tolf ~ eps we demand as much accuracy as we can expect.
 223     if (norm (grad) <= tolf*n*xn)
 224       info = 1;
 225       break;
 226     endif
 227
 228     suc = false;
 229     decfac = 0.5;
 230
 231     ## Inner loop.
 232     while (! suc && niter <= maxiter && nfev < maxfev && ! info)
 233
 234       s = - __doglegm__ (hesr, grad, dg, delta);
 235
 236       sn = norm (dg .* s);
 237       if (niter == 1)
 238         delta = min (delta, sn);
 239       endif
 240
 241       fval1 = fcn (reshape (x + s, xsiz)) (:);
 242       nfev ++;
 243
 244       if (fval1 < fval)
 245         ## Scaled actual reduction.
 246         actred =  (fval - fval1) / (abs (fval1) + abs (fval));
 247       else
 248         actred = -1;
 249       endif
 250
 251       w = hesr*s;
 252       ## Scaled predicted reduction, and ratio.
 253       t = 1/2 * sumsq (w) + grad'*s;
 254       if (t < 0)
 255         prered = -t/(abs (fval) + abs (fval + t));
 256         ratio = actred / prered;
 257       else
 258         prered = 0;
 259         ratio = 0;
 260       endif
 261
 262       ## Update delta.
 263       if (ratio < min(max(0.1, 0.8*lastratio), 0.9))
 264         delta *= decfac;
 265         decfac ^= 1.4142;
 266         if (delta <= 1e1*macheps*xn)
 267           ## Trust region became uselessly small.
 268           info = -3;
 269           break;
 270         endif
 271       else
 272         lastratio = ratio;
 273         decfac = 0.5;
 274         if (abs (1-ratio) <= 0.1)
 275           delta = 1.4142*sn;
 276         elseif (ratio >= 0.5)
 277           delta = max (delta, 1.4142*sn);
 278         endif
 279       endif
 280
 281       if (ratio >= 1e-4)
 282         ## Successful iteration.
 283         x += s;
 284         xn = norm (dg .* x);
 285         fval = fval1;
 286         nsuciter ++;
 287         suc = true;
 288       endif
 289
 290       niter ++;
 291
 292       ## FIXME: should outputfcn be only called after a successful iteration?
 293       if (! isempty (outfcn))
 294         optimvalues.iter = niter;
 295         optimvalues.funccount = nfev;
 296         optimvalues.fval = fval;
 297         optimvalues.searchdirection = s;
 298         state = 'iter';
 299         stop = outfcn (x, optimvalues, state);
 300         if (stop)
 301           info = -1;
 302           break;
 303         endif
 304       endif
 305
 306       ## Tests for termination conditions. A mysterious place, anything
 307       ## can happen if you change something here...
 308
 309       ## The rule of thumb (which I'm not sure M*b is quite following)
 310       ## is that for a tolerance that depends on scaling, only 0 makes
 311       ## sense as a default value. But 0 usually means uselessly long
 312       ## iterations, so we need scaling-independent tolerances wherever
 313       ## possible.
 314
 315       ## The following tests done only after successful step.
 316       if (ratio >= 1e-4)
 317         ## This one is classic. Note that we use scaled variables again,
 318         ## but compare to scaled step, so nothing bad.
 319         if (sn <= tolx*xn)
 320           info = 2;
 321           ## Again a classic one.
 322         elseif (actred < tolf)
 323           info = 3;
 324         endif
 325       endif
 326
 327     endwhile
 328   endwhile
 329
 330   ## Restore original shapes.
 331   x = reshape (x, xsiz);
 332
 333   output.iterations = niter;
 334   output.successful = nsuciter;
 335   output.funcCount = nfev;
 336
 337   if (nargout > 5)
 338     hess = hesr'*hesr;
 339   endif
 340
 341 endfunction
 342
 343 ## An assistant function that evaluates a function handle and checks for
 344 ## bad results.
 345 function [fx, gx] = guarded_eval (fun, x)
 346   if (nargout > 1)
 347     [fx, gx] = fun (x);
 348   else
 349     fx = fun (x);
 350     gx = [];
 351   endif
 352
 353   if (! (isreal (fx) && isreal (gx)))
 354     error ("fminunc:notreal", "fminunc: non-real value encountered");
 355   elseif (any (isnan (fx(:))))
 356     error ("fminunc:isnan", "fminunc: NaN value encountered");
 357   endif
 358 endfunction
 359
 360 %!function f = __rosenb (x)
 361 %!  n = length (x);
 362 %!  f = sumsq (1 - x(1:n-1)) + 100 * sumsq (x(2:n) - x(1:n-1).^2);
 363 %!endfunction
 364 %!test
 365 %! [x, fval, info, out] = fminunc (@__rosenb, [5, -5]);
 366 %! tol = 2e-5;
 367 %! assert (info > 0);
 368 %! assert (x, ones (1, 2), tol);
 369 %! assert (fval, 0, tol);
 370 %!test
 371 %! [x, fval, info, out] = fminunc (@__rosenb, zeros (1, 4));
 372 %! tol = 2e-5;
 373 %! assert (info > 0);
 374 %! assert (x, ones (1, 4), tol);
 375 %! assert (fval, 0, tol);
 376
 377 ## Solve the double dogleg trust-region minimization problem:
 378 ## Minimize 1/2*norm(r*x)^2  subject to the constraint norm(d.*x) <= delta,
 379 ## x being a convex combination of the gauss-newton and scaled gradient.
 380
 381 ## TODO: error checks
 382 ## TODO: handle singularity, or leave it up to mldivide?
 383
 384 function x = __doglegm__ (r, g, d, delta)
 385   ## Get Gauss-Newton direction.
 386   b = r' \ g;
 387   x = r \ b;
 388   xn = norm (d .* x);
 389   if (xn > delta)
 390     ## GN is too big, get scaled gradient.
 391     s = g ./ d;
 392     sn = norm (s);
 393     if (sn > 0)
 394       ## Normalize and rescale.
 395       s = (s / sn) ./ d;
 396       ## Get the line minimizer in s direction.
 397       tn = norm (r*s);
 398       snm = (sn / tn) / tn;
 399       if (snm < delta)
 400         ## Get the dogleg path minimizer.
 401         bn = norm (b);
 402         dxn = delta/xn; snmd = snm/delta;
 403         t = (bn/sn) * (bn/xn) * snmd;
 404         t -= dxn * snmd^2 - sqrt ((t-dxn)^2 + (1-dxn^2)*(1-snmd^2));
 405         alpha = dxn*(1-snmd^2) / t;
 406       else
 407         alpha = 0;
 408       endif
 409     else
 410       alpha = delta / xn;
 411       snm = 0;
 412     endif
 413     ## Form the appropriate convex combination.
 414     x = alpha * x + ((1-alpha) * min (snm, delta)) * s;
 415   endif
 416 endfunction