octave_packages/statistics-1.1.3/hmmestimate.m

   1 ## Copyright (C) 2006, 2007 Arno Onken <asnelt@asnelt.org>
   2 ##
   3 ## This program is free software; you can redistribute it and/or modify it under
   4 ## the terms of the GNU General Public License as published by the Free Software
   5 ## Foundation; either version 3 of the License, or (at your option) any later
   6 ## version.
   7 ##
   8 ## This program is distributed in the hope that it will be useful, but WITHOUT
   9 ## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 ## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  11 ## details.
  12 ##
  13 ## You should have received a copy of the GNU General Public License along with
  14 ## this program; if not, see <http://www.gnu.org/licenses/>.
  15
  16 ## -*- texinfo -*-
  17 ## @deftypefn {Function File} {[@var{transprobest}, @var{outprobest}] =} hmmestimate (@var{sequence}, @var{states})
  18 ## @deftypefnx {Function File} {} hmmestimate (@dots{}, 'statenames', @var{statenames})
  19 ## @deftypefnx {Function File} {} hmmestimate (@dots{}, 'symbols', @var{symbols})
  20 ## @deftypefnx {Function File} {} hmmestimate (@dots{}, 'pseudotransitions', @var{pseudotransitions})
  21 ## @deftypefnx {Function File} {} hmmestimate (@dots{}, 'pseudoemissions', @var{pseudoemissions})
  22 ## Estimate the matrix of transition probabilities and the matrix of output
  23 ## probabilities of a given sequence of outputs and states generated by a
  24 ## hidden Markov model. The model assumes that the generation starts in
  25 ## state @code{1} at step @code{0} but does not include step @code{0} in the
  26 ## generated states and sequence.
  27 ##
  28 ## @subheading Arguments
  29 ##
  30 ## @itemize @bullet
  31 ## @item
  32 ## @var{sequence} is a vector of a sequence of given outputs. The outputs
  33 ## must be integers ranging from @code{1} to the number of outputs of the
  34 ## hidden Markov model.
  35 ##
  36 ## @item
  37 ## @var{states} is a vector of the same length as @var{sequence} of given
  38 ## states. The states must be integers ranging from @code{1} to the number
  39 ## of states of the hidden Markov model.
  40 ## @end itemize
  41 ##
  42 ## @subheading Return values
  43 ##
  44 ## @itemize @bullet
  45 ## @item
  46 ## @var{transprobest} is the matrix of the estimated transition
  47 ## probabilities of the states. @code{transprobest(i, j)} is the estimated
  48 ## probability of a transition to state @code{j} given state @code{i}.
  49 ##
  50 ## @item
  51 ## @var{outprobest} is the matrix of the estimated output probabilities.
  52 ## @code{outprobest(i, j)} is the estimated probability of generating
  53 ## output @code{j} given state @code{i}.
  54 ## @end itemize
  55 ##
  56 ## If @code{'symbols'} is specified, then @var{sequence} is expected to be a
  57 ## sequence of the elements of @var{symbols} instead of integers.
  58 ## @var{symbols} can be a cell array.
  59 ##
  60 ## If @code{'statenames'} is specified, then @var{states} is expected to be
  61 ## a sequence of the elements of @var{statenames} instead of integers.
  62 ## @var{statenames} can be a cell array.
  63 ##
  64 ## If @code{'pseudotransitions'} is specified then the integer matrix
  65 ## @var{pseudotransitions} is used as an initial number of counted
  66 ## transitions. @code{pseudotransitions(i, j)} is the initial number of
  67 ## counted transitions from state @code{i} to state @code{j}.
  68 ## @var{transprobest} will have the same size as @var{pseudotransitions}.
  69 ## Use this if you have transitions that are very unlikely to occur.
  70 ##
  71 ## If @code{'pseudoemissions'} is specified then the integer matrix
  72 ## @var{pseudoemissions} is used as an initial number of counted outputs.
  73 ## @code{pseudoemissions(i, j)} is the initial number of counted outputs
  74 ## @code{j} given state @code{i}. If @code{'pseudoemissions'} is also
  75 ## specified then the number of rows of @var{pseudoemissions} must be the
  76 ## same as the number of rows of @var{pseudotransitions}. @var{outprobest}
  77 ## will have the same size as @var{pseudoemissions}. Use this if you have
  78 ## outputs or states that are very unlikely to occur.
  79 ##
  80 ## @subheading Examples
  81 ##
  82 ## @example
  83 ## @group
  84 ## transprob = [0.8, 0.2; 0.4, 0.6];
  85 ## outprob = [0.2, 0.4, 0.4; 0.7, 0.2, 0.1];
  86 ## [sequence, states] = hmmgenerate (25, transprob, outprob);
  87 ## [transprobest, outprobest] = hmmestimate (sequence, states)
  88 ## @end group
  89 ##
  90 ## @group
  91 ## symbols = @{'A', 'B', 'C'@};
  92 ## statenames = @{'One', 'Two'@};
  93 ## [sequence, states] = hmmgenerate (25, transprob, outprob,
  94 ##                      'symbols', symbols, 'statenames', statenames);
  95 ## [transprobest, outprobest] = hmmestimate (sequence, states,
  96 ##                              'symbols', symbols,
  97 ##                              'statenames', statenames)
  98 ## @end group
  99 ##
 100 ## @group
 101 ## pseudotransitions = [8, 2; 4, 6];
 102 ## pseudoemissions = [2, 4, 4; 7, 2, 1];
 103 ## [sequence, states] = hmmgenerate (25, transprob, outprob);
 104 ## [transprobest, outprobest] = hmmestimate (sequence, states, 'pseudotransitions', pseudotransitions, 'pseudoemissions', pseudoemissions)
 105 ## @end group
 106 ## @end example
 107 ##
 108 ## @subheading References
 109 ##
 110 ## @enumerate
 111 ## @item
 112 ## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics
 113 ## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC,
 114 ## 2001.
 115 ##
 116 ## @item
 117 ## Lawrence R. Rabiner. A Tutorial on Hidden Markov Models and Selected
 118 ## Applications in Speech Recognition. @cite{Proceedings of the IEEE},
 119 ## 77(2), pages 257-286, February 1989.
 120 ## @end enumerate
 121 ## @end deftypefn
 122
 123 ## Author: Arno Onken <asnelt@asnelt.org>
 124 ## Description: Estimation of a hidden Markov model for a given sequence
 125
 126 function [transprobest, outprobest] = hmmestimate (sequence, states, varargin)
 127
 128   # Check arguments
 129   if (nargin < 2 || mod (length (varargin), 2) != 0)
 130     print_usage ();
 131   endif
 132
 133   len = length (sequence);
 134   if (length (states) != len)
 135     error ("hmmestimate: sequence and states must have equal length");
 136   endif
 137
 138   # Flag for symbols
 139   usesym = false;
 140   # Flag for statenames
 141   usesn = false;
 142
 143   # Variables for return values
 144   transprobest = [];
 145   outprobest = [];
 146
 147   # Process varargin
 148   for i = 1:2:length (varargin)
 149     # There must be an identifier: 'symbols', 'statenames',
 150     # 'pseudotransitions' or 'pseudoemissions'
 151     if (! ischar (varargin{i}))
 152       print_usage ();
 153     endif
 154     # Upper case is also fine
 155     lowerarg = lower (varargin{i});
 156     if (strcmp (lowerarg, 'symbols'))
 157       usesym = true;
 158       # Use the following argument as symbols
 159       symbols = varargin{i + 1};
 160     # The same for statenames
 161     elseif (strcmp (lowerarg, 'statenames'))
 162       usesn = true;
 163       # Use the following argument as statenames
 164       statenames = varargin{i + 1};
 165     elseif (strcmp (lowerarg, 'pseudotransitions'))
 166       # Use the following argument as an initial count for transitions
 167       transprobest = varargin{i + 1};
 168       if (! ismatrix (transprobest))
 169         error ("hmmestimate: pseudotransitions must be a non-empty numeric matrix");
 170       endif
 171       if (rows (transprobest) != columns (transprobest))
 172         error ("hmmestimate: pseudotransitions must be a square matrix");
 173       endif
 174     elseif (strcmp (lowerarg, 'pseudoemissions'))
 175       # Use the following argument as an initial count for outputs
 176       outprobest = varargin{i + 1};
 177       if (! ismatrix (outprobest))
 178         error ("hmmestimate: pseudoemissions must be a non-empty numeric matrix");
 179       endif
 180     else
 181       error ("hmmestimate: expected 'symbols', 'statenames', 'pseudotransitions' or 'pseudoemissions' but found '%s'", varargin{i});
 182     endif
 183   endfor
 184
 185   # Transform sequence from symbols to integers if necessary
 186   if (usesym)
 187     # sequenceint is used to build the transformed sequence
 188     sequenceint = zeros (1, len);
 189     for i = 1:length (symbols)
 190       # Search for symbols(i) in the sequence, isequal will have 1 at
 191       # corresponding indices; i is the right integer for that symbol
 192       isequal = ismember (sequence, symbols(i));
 193       # We do not want to change sequenceint if the symbol appears a second
 194       # time in symbols
 195       if (any ((sequenceint == 0) & (isequal == 1)))
 196         isequal *= i;
 197         sequenceint += isequal;
 198       endif
 199     endfor
 200     if (! all (sequenceint))
 201       index = max ((sequenceint == 0) .* (1:len));
 202       error (["hmmestimate: sequence(" int2str (index) ") not in symbols"]);
 203     endif
 204     sequence = sequenceint;
 205   else
 206     if (! isvector (sequence))
 207       error ("hmmestimate: sequence must be a non-empty vector");
 208     endif
 209     if (! all (ismember (sequence, 1:max (sequence))))
 210       index = max ((ismember (sequence, 1:max (sequence)) == 0) .* (1:len));
 211       error (["hmmestimate: sequence(" int2str (index) ") not feasible"]);
 212     endif
 213   endif
 214
 215   # Transform states from statenames to integers if necessary
 216   if (usesn)
 217     # statesint is used to build the transformed states
 218     statesint = zeros (1, len);
 219     for i = 1:length (statenames)
 220       # Search for statenames(i) in states, isequal will have 1 at
 221       # corresponding indices; i is the right integer for that statename
 222       isequal = ismember (states, statenames(i));
 223       # We do not want to change statesint if the statename appears a second
 224       # time in statenames
 225       if (any ((statesint == 0) & (isequal == 1)))
 226         isequal *= i;
 227         statesint += isequal;
 228       endif
 229     endfor
 230     if (! all (statesint))
 231       index = max ((statesint == 0) .* (1:len));
 232       error (["hmmestimate: states(" int2str (index) ") not in statenames"]);
 233     endif
 234     states = statesint;
 235   else
 236     if (! isvector (states))
 237       error ("hmmestimate: states must be a non-empty vector");
 238     endif
 239     if (! all (ismember (states, 1:max (states))))
 240       index = max ((ismember (states, 1:max (states)) == 0) .* (1:len));
 241       error (["hmmestimate: states(" int2str (index) ") not feasible"]);
 242     endif
 243   endif
 244
 245   # Estimate the number of different states as the max of states
 246   nstate = max (states);
 247   # Estimate the number of different outputs as the max of sequence
 248   noutput = max (sequence);
 249
 250   # transprobest is empty if pseudotransitions is not specified
 251   if (isempty (transprobest))
 252     # outprobest is not empty if pseudoemissions is specified
 253     if (! isempty (outprobest))
 254       if (nstate > rows (outprobest))
 255         error ("hmmestimate: not enough rows in pseudoemissions");
 256       endif
 257       # The number of states is specified by pseudoemissions
 258       nstate = rows (outprobest);
 259     endif
 260     transprobest = zeros (nstate, nstate);
 261   else
 262     if (nstate > rows (transprobest))
 263       error ("hmmestimate: not enough rows in pseudotransitions");
 264     endif
 265     # The number of states is given by pseudotransitions
 266     nstate = rows (transprobest);
 267   endif
 268
 269   # outprobest is empty if pseudoemissions is not specified
 270   if (isempty (outprobest))
 271     outprobest = zeros (nstate, noutput);
 272   else
 273     if (noutput > columns (outprobest))
 274       error ("hmmestimate: not enough columns in pseudoemissions");
 275     endif
 276     # Number of outputs is specified by pseudoemissions
 277     noutput = columns (outprobest);
 278     if (rows (outprobest) != nstate)
 279       error ("hmmestimate: pseudoemissions must have the same number of rows as pseudotransitions");
 280     endif
 281   endif
 282
 283   # Assume that the model started in state 1
 284   cstate = 1;
 285   for i = 1:len
 286     # Count the number of transitions for each state pair
 287     transprobest(cstate, states(i)) ++;
 288     cstate = states (i);
 289     # Count the number of outputs for each state output pair
 290     outprobest(cstate, sequence(i)) ++;
 291   endfor
 292
 293   # transprobest and outprobest contain counted numbers
 294   # Each row in transprobest and outprobest should contain estimated
 295   # probabilities
 296   # => scale so that the sum is 1
 297   # A zero row remains zero
 298   # - for transprobest
 299   s = sum (transprobest, 2);
 300   s(s == 0) = 1;
 301   transprobest = transprobest ./ (s * ones (1, nstate));
 302   # - for outprobest
 303   s = sum (outprobest, 2);
 304   s(s == 0) = 1;
 305   outprobest = outprobest ./ (s * ones (1, noutput));
 306
 307 endfunction
 308
 309 %!test
 310 %! sequence = [1, 2, 1, 1, 1, 2, 2, 1, 2, 3, 3, 3, 3, 2, 3, 1, 1, 1, 1, 3, 3, 2, 3, 1, 3];
 311 %! states = [1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1];
 312 %! [transprobest, outprobest] = hmmestimate (sequence, states);
 313 %! expectedtransprob = [0.88889, 0.11111; 0.28571, 0.71429];
 314 %! expectedoutprob = [0.16667, 0.33333, 0.50000; 1.00000, 0.00000, 0.00000];
 315 %! assert (transprobest, expectedtransprob, 0.001);
 316 %! assert (outprobest, expectedoutprob, 0.001);
 317
 318 %!test
 319 %! sequence = {'A', 'B', 'A', 'A', 'A', 'B', 'B', 'A', 'B', 'C', 'C', 'C', 'C', 'B', 'C', 'A', 'A', 'A', 'A', 'C', 'C', 'B', 'C', 'A', 'C'};
 320 %! states = {'One', 'One', 'Two', 'Two', 'Two', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'Two', 'Two', 'Two', 'Two', 'One', 'One', 'One', 'One', 'One', 'One'};
 321 %! symbols = {'A', 'B', 'C'};
 322 %! statenames = {'One', 'Two'};
 323 %! [transprobest, outprobest] = hmmestimate (sequence, states, 'symbols', symbols, 'statenames', statenames);
 324 %! expectedtransprob = [0.88889, 0.11111; 0.28571, 0.71429];
 325 %! expectedoutprob = [0.16667, 0.33333, 0.50000; 1.00000, 0.00000, 0.00000];
 326 %! assert (transprobest, expectedtransprob, 0.001);
 327 %! assert (outprobest, expectedoutprob, 0.001);
 328
 329 %!test
 330 %! sequence = [1, 2, 1, 1, 1, 2, 2, 1, 2, 3, 3, 3, 3, 2, 3, 1, 1, 1, 1, 3, 3, 2, 3, 1, 3];
 331 %! states = [1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1];
 332 %! pseudotransitions = [8, 2; 4, 6];
 333 %! pseudoemissions = [2, 4, 4; 7, 2, 1];
 334 %! [transprobest, outprobest] = hmmestimate (sequence, states, 'pseudotransitions', pseudotransitions, 'pseudoemissions', pseudoemissions);
 335 %! expectedtransprob = [0.85714, 0.14286; 0.35294, 0.64706];
 336 %! expectedoutprob = [0.178571, 0.357143, 0.464286; 0.823529, 0.117647, 0.058824];
 337 %! assert (transprobest, expectedtransprob, 0.001);
 338 %! assert (outprobest, expectedoutprob, 0.001);