1 ## Copyright (C) 2008-2012 VZLU Prague, a.s.
3 ## This file is part of Octave.
5 ## Octave is free software; you can redistribute it and/or modify it
6 ## under the terms of the GNU General Public License as published by
7 ## the Free Software Foundation; either version 3 of the License, or (at
8 ## your option) any later version.
10 ## Octave is distributed in the hope that it will be useful, but
11 ## WITHOUT ANY WARRANTY; without even the implied warranty of
12 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 ## General Public License for more details.
15 ## You should have received a copy of the GNU General Public License
16 ## along with Octave; see the file COPYING. If not, see
17 ## <http://www.gnu.org/licenses/>.
19 ## Author: Jaroslav Hajek <highegg@gmail.com>
22 ## @deftypefn {Function File} {} fminunc (@var{fcn}, @var{x0})
23 ## @deftypefnx {Function File} {} fminunc (@var{fcn}, @var{x0}, @var{options})
24 ## @deftypefnx {Function File} {[@var{x}, @var{fval}, @var{info}, @var{output}, @var{grad}, @var{hess}] =} fminunc (@var{fcn}, @dots{})
25 ## Solve an unconstrained optimization problem defined by the function
27 ## @var{fcn} should accept a vector (array) defining the unknown variables,
28 ## and return the objective function value, optionally with gradient.
29 ## In other words, this function attempts to determine a vector @var{x} such
30 ## that @code{@var{fcn} (@var{x})} is a local minimum.
31 ## @var{x0} determines a starting guess. The shape of @var{x0} is preserved
32 ## in all calls to @var{fcn}, but otherwise it is treated as a column vector.
33 ## @var{options} is a structure specifying additional options.
34 ## Currently, @code{fminunc} recognizes these options:
35 ## @code{"FunValCheck"}, @code{"OutputFcn"}, @code{"TolX"},
36 ## @code{"TolFun"}, @code{"MaxIter"}, @code{"MaxFunEvals"},
37 ## @code{"GradObj"}, @code{"FinDiffType"},
38 ## @code{"TypicalX"}, @code{"AutoScaling"}.
40 ## If @code{"GradObj"} is @code{"on"}, it specifies that @var{fcn},
41 ## called with 2 output arguments, also returns the gradient of the
42 ## objective function at the requested point. @code{"TolX"} specifies
43 ## the termination tolerance in the unknown variables, while
44 ## @code{"TolFun"} is a tolerance for the objective function value.
45 ## The default is @code{1e-7} for both @code{"TolX"} and @code{"TolFun"}.
47 ## For description of the other options, see @code{optimset}.
49 ## On return, @var{fval} contains the value of the function @var{fcn}
50 ## evaluated at @var{x}, and @var{info} may be one of the following values:
54 ## Converged to a solution point. Relative gradient error is less than
59 ## Last relative step size was less than TolX.
62 ## Last relative decrease in function value was less than TolFun.
65 ## Iteration limit exceeded.
68 ## The trust region radius became excessively small.
71 ## Optionally, fminunc can also yield a structure with convergence statistics
72 ## (@var{output}), the output gradient (@var{grad}) and approximate Hessian
75 ## Note: If you only need to minimize a function of a single variable, using
76 ## @code{fminbnd} is usually a much better idea.
77 ## @seealso{fminbnd, optimset}
80 ## PKG_ADD: ## Discard result to avoid polluting workspace with ans at startup.
81 ## PKG_ADD: [~] = __all_opts__ ("fminunc");
83 function [x, fval, info, output, grad, hess] = fminunc (fcn, x0, options = struct ())
85 ## Get default options if requested.
86 if (nargin == 1 && ischar (fcn) && strcmp (fcn, 'defaults'))
87 x = optimset ("MaxIter", 400, "MaxFunEvals", Inf, \
88 "GradObj", "off", "TolX", 1e-7, "TolFun", 1e-7,
89 "OutputFcn", [], "FunValCheck", "off",
90 "FinDiffType", "central",
91 "TypicalX", [], "AutoScaling", "off");
95 if (nargin < 2 || nargin > 3 || ! ismatrix (x0))
100 fcn = str2func (fcn, "global");
106 has_grad = strcmpi (optimget (options, "GradObj", "off"), "on");
107 cdif = strcmpi (optimget (options, "FinDiffType", "central"), "central");
108 maxiter = optimget (options, "MaxIter", 400);
109 maxfev = optimget (options, "MaxFunEvals", Inf);
110 outfcn = optimget (options, "OutputFcn");
112 ## Get the scaling vector from the TypicalX option; when it is empty, use a
113 ## vector of ones. Automatic scaling is controlled by the AutoScaling option.
114 typicalx = optimget (options, "TypicalX");
115 if (isempty (typicalx))
116 typicalx = ones (n, 1);
118 autoscale = strcmpi (optimget (options, "AutoScaling", "off"), "on");
123 funvalchk = strcmpi (optimget (options, "FunValCheck", "off"), "on");
126 ## Replace fcn with a guarded version.
127 fcn = @(x) guarded_eval (fcn, x);
130 ## These defaults are rather stringent. I think that normally, user
131 ## prefers accuracy to performance.
133 macheps = eps (class (x0));
135 tolx = optimget (options, "TolX", 1e-7);
136 tolf = optimget (options, "TolFun", 1e-7);
139 ## FIXME: TypicalX corresponds to user scaling (???)
148 ## Initial evaluation.
149 fval = fcn (reshape (x, xsiz));
152 if (! isempty (outfcn))
153 optimvalues.iter = niter;
154 optimvalues.funccount = nfev;
155 optimvalues.fval = fval;
156 optimvalues.searchdirection = zeros (n, 1);
158 stop = outfcn (x, optimvalues, state);
171 while (niter < maxiter && nfev < maxfev && ! info)
175 ## Calculate function value and gradient (possibly via FD).
177 [fval, grad] = fcn (reshape (x, xsiz));
181 grad = __fdjac__ (fcn, reshape (x, xsiz), fval, typicalx, cdif)(:);
182 nfev += (1 + cdif) * length (x);
186 ## Initialize by identity matrix.
189 ## Use the damped BFGS formula.
194 theta = 0.8 / max (1 - sy / sBs, 0.8);
195 r = theta * y + (1-theta) * Bs;
196 hesr = cholupdate (hesr, r / sqrt (s'*r), "+");
197 [hesr, info] = cholupdate (hesr, Bs / sqrt (sBs), "-");
204 ## Second derivatives approximate the hessian.
205 d2f = norm (hesr, 'columns').';
209 ## FIXME: maybe fixed lower and upper bounds?
210 dg = max (0.1*dg, d2f);
216 ## FIXME: something better?
217 delta = factor * max (xn, 1);
220 ## FIXME -- why tolf*n*xn? If abs (e) ~ abs(x) * eps is a vector
221 ## of perturbations of x, then norm (hesr*e) <= eps*xn, i.e. by
222 ## tolf ~ eps we demand as much accuracy as we can expect.
223 if (norm (grad) <= tolf*n*xn)
232 while (! suc && niter <= maxiter && nfev < maxfev && ! info)
234 s = - __doglegm__ (hesr, grad, dg, delta);
238 delta = min (delta, sn);
241 fval1 = fcn (reshape (x + s, xsiz)) (:);
245 ## Scaled actual reduction.
246 actred = (fval - fval1) / (abs (fval1) + abs (fval));
252 ## Scaled predicted reduction, and ratio.
253 t = 1/2 * sumsq (w) + grad'*s;
255 prered = -t/(abs (fval) + abs (fval + t));
256 ratio = actred / prered;
263 if (ratio < min(max(0.1, 0.8*lastratio), 0.9))
266 if (delta <= 1e1*macheps*xn)
267 ## Trust region became uselessly small.
274 if (abs (1-ratio) <= 0.1)
276 elseif (ratio >= 0.5)
277 delta = max (delta, 1.4142*sn);
282 ## Successful iteration.
292 ## FIXME: should outputfcn be only called after a successful iteration?
293 if (! isempty (outfcn))
294 optimvalues.iter = niter;
295 optimvalues.funccount = nfev;
296 optimvalues.fval = fval;
297 optimvalues.searchdirection = s;
299 stop = outfcn (x, optimvalues, state);
306 ## Tests for termination conditions. A mysterious place, anything
307 ## can happen if you change something here...
309 ## The rule of thumb (which I'm not sure M*b is quite following)
310 ## is that for a tolerance that depends on scaling, only 0 makes
311 ## sense as a default value. But 0 usually means uselessly long
312 ## iterations, so we need scaling-independent tolerances wherever
315 ## The following tests done only after successful step.
317 ## This one is classic. Note that we use scaled variables again,
318 ## but compare to scaled step, so nothing bad.
321 ## Again a classic one.
322 elseif (actred < tolf)
330 ## Restore original shapes.
331 x = reshape (x, xsiz);
333 output.iterations = niter;
334 output.successful = nsuciter;
335 output.funcCount = nfev;
343 ## An assistant function that evaluates a function handle and checks for
345 function [fx, gx] = guarded_eval (fun, x)
353 if (! (isreal (fx) && isreal (gx)))
354 error ("fminunc:notreal", "fminunc: non-real value encountered");
355 elseif (any (isnan (fx(:))))
356 error ("fminunc:isnan", "fminunc: NaN value encountered");
360 %!function f = __rosenb (x)
362 %! f = sumsq (1 - x(1:n-1)) + 100 * sumsq (x(2:n) - x(1:n-1).^2);
365 %! [x, fval, info, out] = fminunc (@__rosenb, [5, -5]);
367 %! assert (info > 0);
368 %! assert (x, ones (1, 2), tol);
369 %! assert (fval, 0, tol);
371 %! [x, fval, info, out] = fminunc (@__rosenb, zeros (1, 4));
373 %! assert (info > 0);
374 %! assert (x, ones (1, 4), tol);
375 %! assert (fval, 0, tol);
377 ## Solve the double dogleg trust-region minimization problem:
378 ## Minimize 1/2*norm(r*x)^2 subject to the constraint norm(d.*x) <= delta,
379 ## x being a convex combination of the gauss-newton and scaled gradient.
381 ## TODO: error checks
382 ## TODO: handle singularity, or leave it up to mldivide?
384 function x = __doglegm__ (r, g, d, delta)
385 ## Get Gauss-Newton direction.
390 ## GN is too big, get scaled gradient.
394 ## Normalize and rescale.
396 ## Get the line minimizer in s direction.
398 snm = (sn / tn) / tn;
400 ## Get the dogleg path minimizer.
402 dxn = delta/xn; snmd = snm/delta;
403 t = (bn/sn) * (bn/xn) * snmd;
404 t -= dxn * snmd^2 - sqrt ((t-dxn)^2 + (1-dxn^2)*(1-snmd^2));
405 alpha = dxn*(1-snmd^2) / t;
413 ## Form the appropriate convex combination.
414 x = alpha * x + ((1-alpha) * min (snm, delta)) * s;