1 ## Copyright (C) 1999-2001 Paul Kienzle <pkienzle@users.sf.net>
3 ## This program is free software; you can redistribute it and/or modify it under
4 ## the terms of the GNU General Public License as published by the Free Software
5 ## Foundation; either version 3 of the License, or (at your option) any later
8 ## This program is distributed in the hope that it will be useful, but WITHOUT
9 ## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 ## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
13 ## You should have received a copy of the GNU General Public License along with
14 ## this program; if not, see <http://www.gnu.org/licenses/>.
16 ## usage: [S [, f [, t]]] = specgram(x [, n [, Fs [, window [, overlap]]]])
18 ## Generate a spectrogram for the signal. This chops the signal into
19 ## overlapping slices, windows each slice and applies a Fourier
20 ## transform to determine the frequency components at that slice.
22 ## x: vector of samples
23 ## n: size of fourier transform window, or [] for default=256
24 ## Fs: sample rate, or [] for default=2 Hz
25 ## window: shape of the fourier transform window, or [] for default=hanning(n)
26 ## Note: window length can be specified instead, in which case
27 ## window=hanning(length)
28 ## overlap: overlap with previous window, or [] for default=length(window)/2
31 ## S is complex output of the FFT, one row per slice
32 ## f is the frequency indices corresponding to the rows of S.
33 ## t is the time indices corresponding to the columns of S.
34 ## If no return value is requested, the spectrogram is displayed instead.
37 ## x = chirp([0:0.001:2],0,2,500); # freq. sweep from 0-500 over 2 sec.
38 ## Fs=1000; # sampled every 0.001 sec so rate is 1 kHz
39 ## step=ceil(20*Fs/1000); # one spectral slice every 20 ms
40 ## window=ceil(100*Fs/1000); # 100 ms data window
41 ## specgram(x, 2^nextpow2(window), Fs, window, window-step);
43 ## ## Speech spectrogram
44 ## [x, Fs] = auload(file_in_loadpath("sample.wav")); # audio file
45 ## step = fix(5*Fs/1000); # one spectral slice every 5 ms
46 ## window = fix(40*Fs/1000); # 40 ms data window
47 ## fftn = 2^nextpow2(window); # next highest power of 2
48 ## [S, f, t] = specgram(x, fftn, Fs, window, window-step);
49 ## S = abs(S(2:fftn*4000/Fs,:)); # magnitude in range 0<f<=4000 Hz.
50 ## S = S/max(S(:)); # normalize magnitude so that max is 0 dB.
51 ## S = max(S, 10^(-40/10)); # clip below -40 dB.
52 ## S = min(S, 10^(-3/10)); # clip above -3 dB.
53 ## imagesc(t, f, flipud(log(S))); # display in log scale
55 ## The choice of window defines the time-frequency resolution. In
56 ## speech for example, a wide window shows more harmonic detail while a
57 ## narrow window averages over the harmonic detail and shows more
58 ## formant structure. The shape of the window is not so critical so long
59 ## as it goes gradually to zero on the ends.
61 ## Step size (which is window length minus overlap) controls the
62 ## horizontal scale of the spectrogram. Decrease it to stretch, or
63 ## increase it to compress. Increasing step size will reduce time
64 ## resolution, but decreasing it will not improve it much beyond the
65 ## limits imposed by the window size (you do gain a little bit,
66 ## depending on the shape of your window, as the peak of the window
67 ## slides over peaks in the signal energy). The range 1-5 msec is good
70 ## FFT length controls the vertical scale. Selecting an FFT length
71 ## greater than the window length does not add any information to the
72 ## spectrum, but it is a good way to interpolate between frequency
73 ## points which can make for prettier spectrograms.
75 ## After you have generated the spectral slices, there are a number of
76 ## decisions for displaying them. First the phase information is
77 ## discarded and the energy normalized:
79 ## S = abs(S); S = S/max(S(:));
81 ## Then the dynamic range of the signal is chosen. Since information in
82 ## speech is well above the noise floor, it makes sense to eliminate any
83 ## dynamic range at the bottom end. This is done by taking the max of
84 ## the magnitude and some minimum energy such as minE=-40dB. Similarly,
85 ## there is not much information in the very top of the range, so
86 ## clipping to a maximum energy such as maxE=-3dB makes sense:
88 ## S = max(S, 10^(minE/10)); S = min(S, 10^(maxE/10));
90 ## The frequency range of the FFT is from 0 to the Nyquist frequency of
91 ## one half the sampling rate. If the signal of interest is band
92 ## limited, you do not need to display the entire frequency range. In
93 ## speech for example, most of the signal is below 4 kHz, so there is no
94 ## reason to display up to the Nyquist frequency of 10 kHz for a 20 kHz
95 ## sampling rate. In this case you will want to keep only the first 40%
96 ## of the rows of the returned S and f. More generally, to display the
97 ## frequency range [minF, maxF], you could use the following row index:
99 ## idx = (f >= minF & f <= maxF);
101 ## Then there is the choice of colormap. A brightness varying colormap
102 ## such as copper or bone gives good shape to the ridges and valleys. A
103 ## hue varying colormap such as jet or hsv gives an indication of the
104 ## steepness of the slopes. The final spectrogram is displayed in log
105 ## energy scale and by convention has low frequencies on the bottom of
108 ## imagesc(t, f, flipud(log(S(idx,:))));
110 function [S_r, f_r, t_r] = specgram(x, n = min(256, length(x)), Fs = 2, window = hanning(n), overlap = ceil(length(window)/2))
112 if nargin < 1 || nargin > 5
114 ## make sure x is a vector
115 elseif columns(x) != 1 && rows(x) != 1
116 error ("specgram data must be a vector");
118 if columns(x) != 1, x = x'; end
120 ## if only the window length is given, generate hanning window
121 if length(window) == 1, window = hanning(window); end
123 ## should be extended to accept a vector of frequencies at which to
124 ## evaluate the fourier transform (via filterbank or chirp
127 error("specgram doesn't handle frequency vectors yet");
130 ## compute window offsets
131 win_size = length(window);
134 warning ("specgram fft size adjusted to %d", n);
136 step = win_size - overlap;
138 ## build matrix of windowed data slices
139 offset = [ 1 : step : length(x)-win_size ];
140 S = zeros (n, length(offset));
141 for i=1:length(offset)
142 S(1:win_size, i) = x(offset(i):offset(i)+win_size-1) .* window;
145 ## compute fourier transform
148 ## extract the positive frequency components
156 f = [0:ret_n-1]*Fs/n;
159 imagesc(t, f, 20*log10(abs(S)));
160 set (gca (), "ydir", "normal");
164 if nargout>0, S_r = S; endif
165 if nargout>1, f_r = f; endif
166 if nargout>2, t_r = t; endif
172 %! x = chirp([0:1/Fs:2],0,2,500); # freq. sweep from 0-500 over 2 sec.
173 %! step=ceil(20*Fs/1000); # one spectral slice every 20 ms
174 %! window=ceil(100*Fs/1000); # 100 ms data window
175 %! [S, f, t] = specgram(x);
177 %! ## test of returned shape
178 %!assert (rows(S), 128)
179 %!assert (columns(f), rows(S))
180 %!assert (columns(t), columns(S))
181 %!test [S, f, t] = specgram(x');
182 %!assert (rows(S), 128)
183 %!assert (columns(f), rows(S));
184 %!assert (columns(t), columns(S));
185 %!error (isempty(specgram([])));
186 %!error (isempty(specgram([1, 2 ; 3, 4])));
191 %! x = chirp([0:1/Fs:2],0,2,500); # freq. sweep from 0-500 over 2 sec.
192 %! step=ceil(20*Fs/1000); # one spectral slice every 20 ms
193 %! window=ceil(100*Fs/1000); # 100 ms data window
195 %! ## test of automatic plot
196 %! [S, f, t] = specgram(x);
197 %! specgram(x, 2^nextpow2(window), Fs, window, window-step);
198 %! disp("shows a diagonal from bottom left to top right");
199 %! input("press enter:","s");
201 %! ## test of returned values
202 %! S = specgram(x, 2^nextpow2(window), Fs, window, window-step);
203 %! imagesc(20*log10(flipud(abs(S))));
204 %! disp("same again, but this time using returned value");
207 %! ## Speech spectrogram
208 %! [x, Fs] = auload(file_in_loadpath("sample.wav")); # audio file
209 %! step = fix(5*Fs/1000); # one spectral slice every 5 ms
210 %! window = fix(40*Fs/1000); # 40 ms data window
211 %! fftn = 2^nextpow2(window); # next highest power of 2
212 %! [S, f, t] = specgram(x, fftn, Fs, window, window-step);
213 %! S = abs(S(2:fftn*4000/Fs,:)); # magnitude in range 0<f<=4000 Hz.
214 %! S = S/max(max(S)); # normalize magnitude so that max is 0 dB.
215 %! S = max(S, 10^(-40/10)); # clip below -40 dB.
216 %! S = min(S, 10^(-3/10)); # clip above -3 dB.
217 %! imagesc(flipud(20*log10(S)));
219 %! % The image contains a spectrogram of 'sample.wav'