Add a useful package (from Source forge) for octave

[CreaPhase.git] / octave_packages / statistics-1.1.3 / hmmestimate.m
diff --git a/octave_packages/statistics-1.1.3/hmmestimate.m b/octave_packages/statistics-1.1.3/hmmestimate.m

new file mode 100644 (file)

index 0000000..dbbff5b
--- /dev/null
+++ b/octave_packages/statistics-1.1.3/hmmestimate.m
@@ -0,0 +1,338 @@
+## Copyright (C) 2006, 2007 Arno Onken <asnelt@asnelt.org>
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <http://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn {Function File} {[@var{transprobest}, @var{outprobest}] =} hmmestimate (@var{sequence}, @var{states})
+## @deftypefnx {Function File} {} hmmestimate (@dots{}, 'statenames', @var{statenames})
+## @deftypefnx {Function File} {} hmmestimate (@dots{}, 'symbols', @var{symbols})
+## @deftypefnx {Function File} {} hmmestimate (@dots{}, 'pseudotransitions', @var{pseudotransitions})
+## @deftypefnx {Function File} {} hmmestimate (@dots{}, 'pseudoemissions', @var{pseudoemissions})
+## Estimate the matrix of transition probabilities and the matrix of output
+## probabilities of a given sequence of outputs and states generated by a
+## hidden Markov model. The model assumes that the generation starts in
+## state @code{1} at step @code{0} but does not include step @code{0} in the
+## generated states and sequence.
+##
+## @subheading Arguments
+##
+## @itemize @bullet
+## @item
+## @var{sequence} is a vector of a sequence of given outputs. The outputs
+## must be integers ranging from @code{1} to the number of outputs of the
+## hidden Markov model.
+##
+## @item
+## @var{states} is a vector of the same length as @var{sequence} of given
+## states. The states must be integers ranging from @code{1} to the number
+## of states of the hidden Markov model.
+## @end itemize
+##
+## @subheading Return values
+##
+## @itemize @bullet
+## @item
+## @var{transprobest} is the matrix of the estimated transition
+## probabilities of the states. @code{transprobest(i, j)} is the estimated
+## probability of a transition to state @code{j} given state @code{i}.
+##
+## @item
+## @var{outprobest} is the matrix of the estimated output probabilities.
+## @code{outprobest(i, j)} is the estimated probability of generating
+## output @code{j} given state @code{i}.
+## @end itemize
+##
+## If @code{'symbols'} is specified, then @var{sequence} is expected to be a
+## sequence of the elements of @var{symbols} instead of integers.
+## @var{symbols} can be a cell array.
+##
+## If @code{'statenames'} is specified, then @var{states} is expected to be
+## a sequence of the elements of @var{statenames} instead of integers.
+## @var{statenames} can be a cell array.
+##
+## If @code{'pseudotransitions'} is specified then the integer matrix
+## @var{pseudotransitions} is used as an initial number of counted
+## transitions. @code{pseudotransitions(i, j)} is the initial number of
+## counted transitions from state @code{i} to state @code{j}.
+## @var{transprobest} will have the same size as @var{pseudotransitions}.
+## Use this if you have transitions that are very unlikely to occur.
+##
+## If @code{'pseudoemissions'} is specified then the integer matrix
+## @var{pseudoemissions} is used as an initial number of counted outputs.
+## @code{pseudoemissions(i, j)} is the initial number of counted outputs
+## @code{j} given state @code{i}. If @code{'pseudoemissions'} is also
+## specified then the number of rows of @var{pseudoemissions} must be the
+## same as the number of rows of @var{pseudotransitions}. @var{outprobest}
+## will have the same size as @var{pseudoemissions}. Use this if you have
+## outputs or states that are very unlikely to occur.
+##
+## @subheading Examples
+##
+## @example
+## @group
+## transprob = [0.8, 0.2; 0.4, 0.6];
+## outprob = [0.2, 0.4, 0.4; 0.7, 0.2, 0.1];
+## [sequence, states] = hmmgenerate (25, transprob, outprob);
+## [transprobest, outprobest] = hmmestimate (sequence, states)
+## @end group
+##
+## @group
+## symbols = @{'A', 'B', 'C'@};
+## statenames = @{'One', 'Two'@};
+## [sequence, states] = hmmgenerate (25, transprob, outprob,
+##                      'symbols', symbols, 'statenames', statenames);
+## [transprobest, outprobest] = hmmestimate (sequence, states,
+##                              'symbols', symbols,
+##                              'statenames', statenames)
+## @end group
+##
+## @group
+## pseudotransitions = [8, 2; 4, 6];
+## pseudoemissions = [2, 4, 4; 7, 2, 1];
+## [sequence, states] = hmmgenerate (25, transprob, outprob);
+## [transprobest, outprobest] = hmmestimate (sequence, states, 'pseudotransitions', pseudotransitions, 'pseudoemissions', pseudoemissions)
+## @end group
+## @end example
+##
+## @subheading References
+##
+## @enumerate
+## @item
+## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics
+## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC,
+## 2001.
+##
+## @item
+## Lawrence R. Rabiner. A Tutorial on Hidden Markov Models and Selected
+## Applications in Speech Recognition. @cite{Proceedings of the IEEE},
+## 77(2), pages 257-286, February 1989.
+## @end enumerate
+## @end deftypefn
+
+## Author: Arno Onken <asnelt@asnelt.org>
+## Description: Estimation of a hidden Markov model for a given sequence
+
+function [transprobest, outprobest] = hmmestimate (sequence, states, varargin)
+
+  # Check arguments
+  if (nargin < 2 || mod (length (varargin), 2) != 0)
+    print_usage ();
+  endif
+
+  len = length (sequence);
+  if (length (states) != len)
+    error ("hmmestimate: sequence and states must have equal length");
+  endif
+
+  # Flag for symbols
+  usesym = false;
+  # Flag for statenames
+  usesn = false;
+
+  # Variables for return values
+  transprobest = [];
+  outprobest = [];
+
+  # Process varargin
+  for i = 1:2:length (varargin)
+    # There must be an identifier: 'symbols', 'statenames',
+    # 'pseudotransitions' or 'pseudoemissions'
+    if (! ischar (varargin{i}))
+      print_usage ();
+    endif
+    # Upper case is also fine
+    lowerarg = lower (varargin{i});
+    if (strcmp (lowerarg, 'symbols'))
+      usesym = true;
+      # Use the following argument as symbols
+      symbols = varargin{i + 1};
+    # The same for statenames
+    elseif (strcmp (lowerarg, 'statenames'))
+      usesn = true;
+      # Use the following argument as statenames
+      statenames = varargin{i + 1};
+    elseif (strcmp (lowerarg, 'pseudotransitions'))
+      # Use the following argument as an initial count for transitions
+      transprobest = varargin{i + 1};
+      if (! ismatrix (transprobest))
+        error ("hmmestimate: pseudotransitions must be a non-empty numeric matrix");
+      endif
+      if (rows (transprobest) != columns (transprobest))
+        error ("hmmestimate: pseudotransitions must be a square matrix");
+      endif
+    elseif (strcmp (lowerarg, 'pseudoemissions'))
+      # Use the following argument as an initial count for outputs
+      outprobest = varargin{i + 1};
+      if (! ismatrix (outprobest))
+        error ("hmmestimate: pseudoemissions must be a non-empty numeric matrix");
+      endif
+    else
+      error ("hmmestimate: expected 'symbols', 'statenames', 'pseudotransitions' or 'pseudoemissions' but found '%s'", varargin{i});
+    endif
+  endfor
+
+  # Transform sequence from symbols to integers if necessary
+  if (usesym)
+    # sequenceint is used to build the transformed sequence
+    sequenceint = zeros (1, len);
+    for i = 1:length (symbols)
+      # Search for symbols(i) in the sequence, isequal will have 1 at
+      # corresponding indices; i is the right integer for that symbol
+      isequal = ismember (sequence, symbols(i));
+      # We do not want to change sequenceint if the symbol appears a second
+      # time in symbols
+      if (any ((sequenceint == 0) & (isequal == 1)))
+        isequal *= i;
+        sequenceint += isequal;
+      endif
+    endfor
+    if (! all (sequenceint))
+      index = max ((sequenceint == 0) .* (1:len));
+      error (["hmmestimate: sequence(" int2str (index) ") not in symbols"]);
+    endif
+    sequence = sequenceint;
+  else
+    if (! isvector (sequence))
+      error ("hmmestimate: sequence must be a non-empty vector");
+    endif
+    if (! all (ismember (sequence, 1:max (sequence))))
+      index = max ((ismember (sequence, 1:max (sequence)) == 0) .* (1:len));
+      error (["hmmestimate: sequence(" int2str (index) ") not feasible"]);
+    endif
+  endif
+
+  # Transform states from statenames to integers if necessary
+  if (usesn)
+    # statesint is used to build the transformed states
+    statesint = zeros (1, len);
+    for i = 1:length (statenames)
+      # Search for statenames(i) in states, isequal will have 1 at
+      # corresponding indices; i is the right integer for that statename
+      isequal = ismember (states, statenames(i));
+      # We do not want to change statesint if the statename appears a second
+      # time in statenames
+      if (any ((statesint == 0) & (isequal == 1)))
+        isequal *= i;
+        statesint += isequal;
+      endif
+    endfor
+    if (! all (statesint))
+      index = max ((statesint == 0) .* (1:len));
+      error (["hmmestimate: states(" int2str (index) ") not in statenames"]);
+    endif
+    states = statesint;
+  else
+    if (! isvector (states))
+      error ("hmmestimate: states must be a non-empty vector");
+    endif
+    if (! all (ismember (states, 1:max (states))))
+      index = max ((ismember (states, 1:max (states)) == 0) .* (1:len));
+      error (["hmmestimate: states(" int2str (index) ") not feasible"]);
+    endif
+  endif
+
+  # Estimate the number of different states as the max of states  
+  nstate = max (states);
+  # Estimate the number of different outputs as the max of sequence
+  noutput = max (sequence);
+
+  # transprobest is empty if pseudotransitions is not specified
+  if (isempty (transprobest))
+    # outprobest is not empty if pseudoemissions is specified
+    if (! isempty (outprobest))
+      if (nstate > rows (outprobest))
+        error ("hmmestimate: not enough rows in pseudoemissions");
+      endif
+      # The number of states is specified by pseudoemissions
+      nstate = rows (outprobest);
+    endif
+    transprobest = zeros (nstate, nstate);
+  else
+    if (nstate > rows (transprobest))
+      error ("hmmestimate: not enough rows in pseudotransitions");
+    endif
+    # The number of states is given by pseudotransitions
+    nstate = rows (transprobest);
+  endif
+
+  # outprobest is empty if pseudoemissions is not specified
+  if (isempty (outprobest))
+    outprobest = zeros (nstate, noutput);
+  else
+    if (noutput > columns (outprobest))
+      error ("hmmestimate: not enough columns in pseudoemissions");
+    endif
+    # Number of outputs is specified by pseudoemissions
+    noutput = columns (outprobest);
+    if (rows (outprobest) != nstate)
+      error ("hmmestimate: pseudoemissions must have the same number of rows as pseudotransitions");
+    endif
+  endif
+
+  # Assume that the model started in state 1
+  cstate = 1;
+  for i = 1:len
+    # Count the number of transitions for each state pair
+    transprobest(cstate, states(i)) ++;
+    cstate = states (i);
+    # Count the number of outputs for each state output pair
+    outprobest(cstate, sequence(i)) ++;
+  endfor
+
+  # transprobest and outprobest contain counted numbers
+  # Each row in transprobest and outprobest should contain estimated
+  # probabilities
+  # => scale so that the sum is 1
+  # A zero row remains zero
+  # - for transprobest
+  s = sum (transprobest, 2);
+  s(s == 0) = 1;
+  transprobest = transprobest ./ (s * ones (1, nstate));
+  # - for outprobest
+  s = sum (outprobest, 2);
+  s(s == 0) = 1;
+  outprobest = outprobest ./ (s * ones (1, noutput));
+
+endfunction
+
+%!test
+%! sequence = [1, 2, 1, 1, 1, 2, 2, 1, 2, 3, 3, 3, 3, 2, 3, 1, 1, 1, 1, 3, 3, 2, 3, 1, 3];
+%! states = [1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1];
+%! [transprobest, outprobest] = hmmestimate (sequence, states);
+%! expectedtransprob = [0.88889, 0.11111; 0.28571, 0.71429];
+%! expectedoutprob = [0.16667, 0.33333, 0.50000; 1.00000, 0.00000, 0.00000];
+%! assert (transprobest, expectedtransprob, 0.001);
+%! assert (outprobest, expectedoutprob, 0.001);
+
+%!test
+%! sequence = {'A', 'B', 'A', 'A', 'A', 'B', 'B', 'A', 'B', 'C', 'C', 'C', 'C', 'B', 'C', 'A', 'A', 'A', 'A', 'C', 'C', 'B', 'C', 'A', 'C'};
+%! states = {'One', 'One', 'Two', 'Two', 'Two', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'Two', 'Two', 'Two', 'Two', 'One', 'One', 'One', 'One', 'One', 'One'};
+%! symbols = {'A', 'B', 'C'};
+%! statenames = {'One', 'Two'};
+%! [transprobest, outprobest] = hmmestimate (sequence, states, 'symbols', symbols, 'statenames', statenames);
+%! expectedtransprob = [0.88889, 0.11111; 0.28571, 0.71429];
+%! expectedoutprob = [0.16667, 0.33333, 0.50000; 1.00000, 0.00000, 0.00000];
+%! assert (transprobest, expectedtransprob, 0.001);
+%! assert (outprobest, expectedoutprob, 0.001);
+
+%!test
+%! sequence = [1, 2, 1, 1, 1, 2, 2, 1, 2, 3, 3, 3, 3, 2, 3, 1, 1, 1, 1, 3, 3, 2, 3, 1, 3];
+%! states = [1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1];
+%! pseudotransitions = [8, 2; 4, 6];
+%! pseudoemissions = [2, 4, 4; 7, 2, 1];
+%! [transprobest, outprobest] = hmmestimate (sequence, states, 'pseudotransitions', pseudotransitions, 'pseudoemissions', pseudoemissions);
+%! expectedtransprob = [0.85714, 0.14286; 0.35294, 0.64706];
+%! expectedoutprob = [0.178571, 0.357143, 0.464286; 0.823529, 0.117647, 0.058824];
+%! assert (transprobest, expectedtransprob, 0.001);
+%! assert (outprobest, expectedoutprob, 0.001);