Nsound  0.9.4
cepstral_pitch_estimate.cc
Go to the documentation of this file.
1 //-----------------------------------------------------------------------------
2 //
3 // $Id: cepstral_pitch_estimate.cc 878 2014-11-23 04:51:23Z weegreenblobbie $
4 //
5 //-----------------------------------------------------------------------------
6 
7 // Include the Nsound headers
8 #include <Nsound/NsoundAll.h>
9 
10 #include <iostream>
11 
12 using std::cout;
13 using std::cerr;
14 using std::endl;
15 
16 using namespace Nsound;
17 
18 int
19 my_main(void)
20 {
21  Plotter pylab;
22 
23  // Region of the spectrogram to pull out for cepstral analysis.
24  float64 FMIN = 80.0;
25  float64 FMAX = 2000.0;
26 
27  // The maximum pitch value to consider when searching for the peak in the
28  // cepstrum analysis.
29  float64 PMAX = 600.0;
30 
31  // Read in the target wavefile.
32 
33  std::string input_filename = "walle.wav";
34 
35  AudioStream a(input_filename);
36 
37  // Since the sample is speech, I can downsample it to 8 kHz to speed up
38  // the calculations. Most of the harmonic information is less than 4 kHz.
39 
40  float64 sr = a.getSampleRate();
41 
42  float64 ratio = 8000.0 / sr;
43 
44  a.resample(ratio);
45 
46  sr = 8000.0;
47  a.setSampleRate(static_cast<uint32>(sr));
48 
49  // Calculate the STFT a.k.a. the spectrogram.
50 
51  Spectrogram spec(a[0], sr, 0.060, 0.010, HANNING);
52 
53  Buffer freq_axis = spec.getFrequencyAxis();
54 //~ float64 df = freq_axis[1] - freq_axis[0];
55 
56  Buffer time_axis = spec.getTimeAxis();
57  uint32 n_time_steps = time_axis.getLength();
58 
59  AudioStream magnitude = spec.getMagnitude();
60 
61  // N = FFT size.
62  uint32 N = magnitude.getLength() * 2;
63 
64  // Cepstrum size.
65  uint32 M = 2 * N;
66 
67  cerr << "N = " << N << endl;
68  cerr << "M = " << M << endl;
69  cerr << "M/2 + 1 = " << (M / 2 + 1) << endl;
70 
71  cerr << "spectrum.size = " << magnitude.getNChannels() << endl;
72 
73  // Determine the indicies of the spectrogram to pull out for cepstral
74  // analysis.
75 
76  float64 vmin = freq_axis[0];
77  float64 vstep = freq_axis[1] - vmin;
78 
79  uint32 fmin_index = static_cast<uint32>( (FMIN - vmin) / vstep + 0.5);
80  uint32 fmax_index = static_cast<uint32>( (FMAX - vmin) / vstep + 0.5);
81 
82  Generator gen(1);
83 
84  Buffer n = gen.drawLine(M / 2 + 1, 0.0, M / 2 + 1);
85 
86  n[0] = 0.1;
87 
88  Buffer quefrency_axis = (N * n) / (M * sr);
89 
90  Buffer pitch_axis = 1.0 / quefrency_axis;
91 
92  cerr << "quefrency_axis.getLength() = " << quefrency_axis.getLength() << endl;
93  cerr << "pitch_axis.getLength() = " << pitch_axis.getLength() << endl;
94 
95  // Find the index for PMAX
96 
97  uint32 pmax_index = (pitch_axis - PMAX).getAbs().argmin();
98 
99  // Get pitch_axis range of interest.
100  pitch_axis = pitch_axis.subbuffer(pmax_index);
101 
102  // Extract the pitch for each time step.
103 
104  FFTransform fft(sr); // really the sample rate is never used for anything.
105 
106  // Create a Hanning window
107  Buffer window = gen.drawWindowHanning(M / 2 + 1);
108 
109  Buffer pitch_path;
110 
111  boolean once = true;
112 
113  AudioStream cep_space(1.0, n_time_steps);
114 
115  for(uint32 i = 0; i < n_time_steps; ++i)
116  {
117  Buffer spectrum = magnitude[i];
118 
119  // Pull out region of the spectrum from FMIN to FMAX
120  spectrum = spectrum.subbuffer(fmin_index, fmax_index - fmin_index);
121 
122  // Convert to dB.
123  spectrum = 10.0 * (1.0 + spectrum).getdB();
124 
125 //~ // Subtract mean and window
126 //~ spectrum -= spectrum.getMean();
127 //~ spectrum *= window;
128 
129  // Cepstral
130  FFTChunkVector vec = fft.fft(spectrum, M);
131 
132  Buffer cepstrum = vec[0].getMagnitude();
133 
134  if(once)
135  {
136  once = false;
137  cerr << "cepstrum.getLength() = " << cepstrum.getLength() << endl;
138  }
139 
140  // Pull out region of interest.
141  cepstrum = cepstrum.subbuffer(pmax_index);
142 
143  uint32 p_index = cepstrum.argmax();
144 
145  pitch_path << pitch_axis[p_index];
146 
147  cep_space[i] << cepstrum;
148  }
149 
150  // Plot spectrogram + 20 harmonics
151 
152  spec.plot(input_filename);
153 
154  for(float64 i = 1.0; i <= 20.0; i += 1.0)
155  {
156  pylab.plot(time_axis, i * pitch_path, "r+-");
157  }
158 
159  // Set some limits
160  pylab.ylim(0, 2000.0);
161  pylab.xlim(0.4, 1.6);
162 
163  pylab.title("Spectrogram + Estimated Pitch Path");
164 
165  // Plot the cepstral matrix, where the peak was pulled out for the pitch
166  // estimate.
167 
168  pylab.figure();
169 
170  Buffer bin_axis = gen.drawLine(
171  cep_space.getLength(), 0, cep_space.getLength());
172 
173  cep_space.transpose();
174 
175  cout << "cep_space.getLength() = " << cep_space.getLength() << endl;
176  cout << "cep_space.getNChannels() = " << cep_space.getNChannels() << endl;
177  cout << "time_axis.getLength() = " << time_axis.getLength() << endl;
178  cout << "bin_axis.getLength() = " << bin_axis.getLength() << endl;
179 
180  pylab.imagesc(time_axis, bin_axis, cep_space);
181  pylab.title("Cepstrum");
182  pylab.xlabel("Time (s)");
183  pylab.ylabel("Cepstral bin (not frequency!)");
184  // Set some limits
185  pylab.xlim(0.4, 1.6);
186 
187  Plotter::show();
188 
189  return 0;
190 }
191 
192 int main(int /*argc*/, char ** /*argv*/)
193 {
194  try
195  {
196  my_main();
197  return 0;
198  }
199  catch(std::exception & e)
200  {
201  cerr << "Exception: " << e.what() << endl;
202  return 1;
203  }
204 
205  return 0;
206 }
Buffer subbuffer(uint32 start_index, uint32 n_samples=0) const
Slice the Buffer.
Definition: Buffer.cc:2073
unsigned int uint32
Definition: Nsound.h:153
void xlabel(const std::string &label, const std::string &kwargs="")
Add a label x axis.
Definition: Plotter.cc:1154
int main(int, char **)
static void show()
Acutally draw the plots to the screen.
Definition: Plotter.cc:252
Buffer getTimeAxis() const
Definition: Spectrogram.cc:272
void plot(const Buffer &y, const std::string &fmt="", const std::string &kwargs="")
Plots the Buffer on the current figure.
Definition: Plotter.cc:765
float64 getSampleRate() const
Returns the sample rate of the stream.
Definition: AudioStream.h:217
uint32 getLength() const
Returns the number of samples of audio data in the stream.
Definition: AudioStream.cc:197
void figure(const std::string &kwargs="") const
Creates a new figure window to plot in.
Definition: Plotter.cc:455
void title(const std::string &title, const std::string &kwargs="")
Add a title to the plot at the top and centered.
Definition: Plotter.cc:1127
void xlim(const float64 &xmin, const float64 &xmax)
Sets the limit for the x & y axis.
Definition: Plotter.cc:389
double float64
Definition: Nsound.h:146
Buffer drawWindowHanning(const float64 &duration) const
Draws a Hanning window.
Definition: Generator.cc:819
uint32 getLength() const
Returns the number of samples in the Buffer.
Definition: Buffer.h:587
The result from an STFT.
Definition: Spectrogram.h:47
void ylim(const float64 &ymin, const float64 &ymax)
Definition: Plotter.cc:422
uint32 getNChannels(void) const
Returns the number of audio channels in the stream.
Definition: AudioStream.h:212
void imagesc(const AudioStream &Z, const std::string &kwargs="")
Plots the AudioStream like a 2D matrix.
Definition: Plotter.cc:593
Buffer getFrequencyAxis() const
Definition: Spectrogram.cc:258
void transpose()
Treating the AudioStream as a matrix, this peforms a matrix transpose.
Definition: AudioStream.cc:875
void ylabel(const std::string &label, const std::string &kwargs="")
Add a label y axis.
Definition: Plotter.cc:1180
A Class that performes the Fast Fouier Transfrom on a Buffer.
Definition: FFTransform.h:57
int my_main(void)
A Buffer for storing audio samples.
Definition: Buffer.h:60
Buffer fft(const Buffer &time_domain) const
Transforms the time_domain signal and calculates the FFT.
Definition: FFTransform.cc:50
Buffer drawLine(const float64 &duration, const float64 &amplitude_start, const float64 &amplitude_finish) const
This method draws a linear line beteween 2 points.
Definition: Generator.cc:464
void plot(const std::string &title="", const boolean &use_dB=true, const float64 &squash=0.5) const
Definition: Spectrogram.cc:301
AudioStream getMagnitude() const
Definition: Spectrogram.cc:265
void setSampleRate(uint32 sample_rate)
Definition: AudioStream.h:483
std::vector< FFTChunk > FFTChunkVector
Definition: FFTChunk.h:119
A class the provides draw utilities and a wavetable oscillator.
Definition: Generator.h:50
void resample(float64 factor)
Resample by a non-integer factor.
Definition: AudioStream.cc:706
float64 sr
Definition: example3.cc:24
uint32 argmax() const
Retruns the index of the maximum value in the Buffer.
Definition: Buffer.cc:460