/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                     Copyright (c) 1995,1996                           */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission to use, copy, modify, distribute this software and its    */
/*  documentation for research, educational and individual use only, is  */
/*  hereby granted without fee, subject to the following conditions:     */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*  This software may not be used for commercial purposes without        */
/*  specific prior written permission from the authors.                  */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/

#ifndef __EST_SIGPR_UTT_H__
#define __EST_SIGPR_UTT_H__

#include "sigpr/EST_sigpr_frame.h"
#include "sigpr/EST_Window.h"
#include "EST_Track.h"
#include "EST_Wave.h"

/* Name of the window function used when the caller does not pass one: it
   is handed to EST_Window::creator() in the default {\tt wf} argument of
   sig2coef(), fbank() and melcep() below. */
#define DEFAULT_WINDOW_NAME "hamming"
/* Presumably the default frame length factor.  NOTE(review): nothing in
   this header references this macro -- sig2coef() spells its default as
   the literal 2.0 instead; confirm whether other files rely on it. */
#define DEFAULT_FRAME_FACTOR 2.0

/**@name Utterance-based signal processing functions

This family of functions performs signal processing on an entire
waveform and stores the output in a track.

Speech is often termed "quasi-stationary" which means that although the
speech signal is constantly varying, over short intervals it can be
considered stationary. So processing an utterance of speech involves
dividing it up into small sections, {\it frames}, upon which the
signal processing functions described in 
\Ref{Frame based signal processing functions} can operate.

In most of the following functions, the input is a \Ref{EST_Wave}
waveform, and the output is a (usually multi-channel) \Ref{EST_Track}.
The EST_Track must be of the correct size before the function is
called. 

The positions of the frames are found by examination of the {\bf time}
array in the EST_Track, which must be filled prior to the function
call. The usual requirement is for fixed frame analysis, where each
analysis frame is, say, 10ms after the previous one. The
\Ref{make_fixed_times(EST_Track &pm, EST_Wave &sig, float shift)} function will fill the time array at a specified
frame shift. A common alternative is to perform pitch-synchronous
analysis where the time shift is related to the local pitch period.


@see Frame based signal processing functions

*/ //@{

/** Convert a track containing coefficients of one type to a track
containing coefficients of another.

@param in_track input set of coefficients
@param out_track output set of coefficients
@param out_type name of the desired output coefficient type.
@param in_type optional (defaults to the empty string): often it is
possible to determine the type of the input coefficients from the
channel names. If this is not possible, or these names should be
ignored, the {\tt in_type} parameter can be used.

*/

void convert_track(EST_Track &in_track, EST_Track &out_track,
		   const EST_String &out_type, 
		   const EST_String &in_type = "");


/** Produce a set of coefficients for a waveform.

This is a general function which performs frame-by-frame analysis on
a waveform and calls a frame based signal processing function to perform
the analysis.


@param sig input waveform
@param a output coefficients. These must have been pre-allocated, and the
       number of channels in {\tt a} indicates the order of the analysis.
@param type the type of coefficients to be produced, e.g. "lpc", "cep".


@param factor optional (default 2.0): the frame length factor, i.e. the
       analysis frame length will be this times the local pitch period.

@param wf optional: function for windowing. When omitted, the window
       created from DEFAULT_WINDOW_NAME is used.
       See \Ref{Windowing mechanisms}
*/

void sig2coef(EST_Wave &sig, EST_Track &a, EST_String type, 
	      float factor = 2.0, 
	      EST_WindowFunc *wf = EST_Window::creator(DEFAULT_WINDOW_NAME));

/** Calculate the power for each frame of the waveform.

@param sig input waveform
@param a output coefficients
@param factor the frame length factor, i.e. the analysis frame length
       will be this times the local pitch period. No default value is
       declared here, so it must always be supplied by the caller.
*/

void power(EST_Wave &sig, EST_Track &a, float factor);

/** Calculate the rms energy for each frame of the waveform.

This function calls
\Ref{sig2energy}


@param sig input waveform
@param a output coefficients
@param factor the frame length factor, i.e. the analysis frame length
       will be this times the local pitch period. No default value is
       declared here, so it must always be supplied by the caller.

*/

void energy(EST_Wave &sig, EST_Track &a, float factor);

/** Produce a set of {\it\bf delta} coefficients for track {\tt tr}.

The delta function is used to produce a set of coefficients which
estimate the rate of change of a set of parameters.

@param tr track holding the parameters whose rate of change is estimated
@param d presumably the track which receives the delta coefficients
       (TODO confirm against the implementation)
@param regression_length optional (default 3). NOTE(review): presumably
       the number of points used for each rate of change estimate --
       confirm against the implementation.
*/

void delta(EST_Track &tr, EST_Track &d, int regression_length = 3);

/** Return the frame size in {\bf samples} based on analysis of
current time points.  This function basically determines the local
frame size (shift) by subtracting the current time point from the next
time point. If the {\tt prefer_prev} flag is set to {\tt true}, or the
index is the last in the track, the size is determined by subtracting
the previous time point from the current one.

This is most commonly used in pitch synchronous analysis to determine
the local pitch period.

@param pms track whose time points are examined
@param current_pos index of the current time point in {\tt pms}
@param sample_rate presumably the sampling rate used to express the
       frame size in samples (TODO confirm against the implementation)
@param prefer_prev optional (default 0): declared as an {\tt int}; a
       non-zero value selects the "previous time point" behaviour
       described above.
@return the local frame size in samples

@see get_time_frame_size
*/
int get_frame_size(EST_Track &pms, int current_pos, int sample_rate, 
			 int prefer_prev=0);

/** Return the frame size in {\bf seconds} based on analysis of
current time points.  This function basically determines the local
frame size (shift) by subtracting the current time point from the next
time point. If the {\tt prefer_prev} flag is set to {\tt true}, or the
index is the last in the track, the size is determined by subtracting
the previous time point from the current one.

This is most commonly used in pitch synchronous analysis to determine
the local pitch period.

@param pms track whose time points are examined
@param i index of the current time point in {\tt pms}
@param prefer_prev optional (default 0): declared as an {\tt int}; a
       non-zero value selects the "previous time point" behaviour
       described above.
@return the local frame size in seconds

@see get_frame_size
*/


float get_time_frame_size(EST_Track &pms, int i, int prefer_prev = 0);


/**@name Pitch/F0 Detection Algorithm functions

These functions are used to produce a track of fundamental frequency
(F0) against time of a waveform.
*/

//@{   

/** Top level pitch (F0) detection algorithm. Produces a track
containing evenly spaced frames of speech, each containing an F0 value
for that point. The function itself is declared {\tt void}.
@param sig is the waveform to be processed
@param fz presumably the track which receives the F0 values
       (TODO confirm against the implementation)
@param op contains the options regarding pitch tracking parameters
@param method optional (defaults to the empty string): is the pda method
       to be used, eg. "srpd"
@see icda
*/

void pda(EST_Wave &sig, EST_Track &fz, EST_Option &op, EST_String method="");


/** Top level intonation contour detection algorithm. Produces a track
containing evenly spaced frames of speech, each containing an F0 for
that point. {\tt icda} differs from \Ref{pda} in that the contour is
smoothed, and unvoiced portions have interpolated F0
values. The function itself is declared {\tt void}.

@param  sig is the waveform to be processed
@param  fz presumably the track which receives the F0 contour
        (TODO confirm against the implementation)
@param  speech Interpolation is controlled by the {\tt speech} track. When
a point has a positive value in the speech track, it is a candidate
for interpolation.
@param  op contains the options regarding pitch tracking parameters
@param  method optional (defaults to the empty string): is the pda method
        to be used.
@see pda
*/

void icda(EST_Wave &sig, EST_Track &fz, EST_Track &speech, 
	  EST_Option &op, EST_String method = "");

/** Set sensible default options for use in \Ref{pda} and \Ref{icda}.

@param al the options object in which the defaults are set
*/
void default_pda_options(EST_Option &al);

/** Use super resolution pitch tracker with default options for pda.

@param sig the waveform to be processed
@param fz presumably the track which receives the F0 values
       (TODO confirm against the implementation)
*/
void do_srpd_fz(EST_Wave &sig, EST_Track &fz);

/** Use super resolution pitch tracker for pda.

@param sig the waveform to be processed
@param fz presumably the track which receives the F0 values
       (TODO confirm against the implementation)
@param options the options passed to the pitch tracker, in place of the
       defaults used by the two-argument overload
*/
void do_srpd_fz(EST_Wave &sig, EST_Track &fz, EST_Option &options);

/** Smooth selected parts of an f0 contour.  Interpolation is
controlled by the {\tt speech} track. When a point has a positive
value in the speech track, it is a candidate for interpolation.

@param c the f0 contour
@param speech track marking which points are candidates for interpolation
@param options smoothing options. NOTE(review): the option names that
       are read are not visible from this header.
@param sm presumably the track which receives the smoothed contour
       (TODO confirm against the implementation)
*/
void smooth_phrase(EST_Track &c, EST_Track &speech, EST_Option &options, 
		   EST_Track &sm);

/** Smooth all the points in an F0 contour.

@param c the F0 contour. NOTE(review): there is no separate output
       argument, so presumably it is smoothed in place -- confirm.
@param op smoothing options
*/
void smooth_portion(EST_Track &c, EST_Option &op);

//@}


/**@name Filter bank and cepstral analysis
*/

//@{


/** Mel scale filter bank analysis. The Mel scale triangular filters
are computed via an FFT (see \Ref{fastFFT}). This routine is required
for Mel cepstral analysis (see \Ref{melcep}). The analysis of each
frame is done by \Ref{sig2fbank}.

A typical filter bank analysis for speech recognition might use log
energy outputs from 20 filters.

@param sig input waveform
@param fbank the output. The number of filters is determined from the
       size of this track (NOTE(review): presumably its number of
       channels -- confirm against the implementation).
@param factor the frame length factor, i.e. the analysis frame length
       will be this times the local pitch period. No default value is
       declared here, so it must always be supplied by the caller.
@param wf optional: function for windowing. When omitted, the window
       created from DEFAULT_WINDOW_NAME is used.
       See \Ref{Windowing mechanisms}
@param use_power_rather_than_energy optional (default false): whether the
       filterbank analysis should use power rather than energy.
@param take_log optional (default true): whether to take logs of the
       filter outputs

@see sig2fbank
@see melcep
*/

void fbank(EST_Wave &sig,
	   EST_Track &fbank,
	   const float factor,
	   EST_WindowFunc *wf = EST_Window::creator(DEFAULT_WINDOW_NAME),
	   const bool use_power_rather_than_energy = false,
	   const bool take_log = true);

/** Mel scale cepstral analysis via filter bank analysis. Cepstral
parameters are computed for each frame of speech. The analysis
requires \Ref{fbank}. The cepstral analysis of the filterbank outputs
is performed by \Ref{fbank2melcep}.

A typical Mel cepstral coefficient (MFCC) analysis for speech recognition
might use 12 cepstral coefficients computed from a 20 channel filterbank.


@param sig input waveform
@param mfcc_track the output
@param factor the frame length factor, i.e. the analysis frame length
       will be this times the local pitch period. No default value is
       declared here, so it must always be supplied by the caller.
@param fbank_order the number of Mel scale filters used for the analysis
@param liftering_parameter for filtering in the cepstral domain
       See \Ref{fbank2melcep}
@param wf optional: function for windowing. When omitted, the window
       created from DEFAULT_WINDOW_NAME is used.
       See \Ref{Windowing mechanisms}
@param include_c0 optional (default false): whether the zero'th cepstral
       coefficient is to be included
@param use_power_rather_than_energy optional (default false): whether the
       filterbank analysis should use power rather than energy.

@see fbank
@see fbank2melcep
*/

void melcep(EST_Wave &sig, 
	    EST_Track &mfcc_track, 
	    float factor,
	    int fbank_order,
	    float liftering_parameter,
	    EST_WindowFunc *wf = EST_Window::creator(DEFAULT_WINDOW_NAME),
	    const bool include_c0 = false,
	    const bool use_power_rather_than_energy = false);

//@}
// end of filter bank and cepstral analysis

//@}

#endif /* __EST_SIGPR_UTT_H__ */

