/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                       Copyright (c) 1996,1997                         */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission to use, copy, modify, distribute this software and its    */
/*  documentation for research, educational and individual use only, is  */
/*  hereby granted without fee, subject to the following conditions:     */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*  This software may not be used for commercial purposes without        */
/*  specific prior written permission from the authors.                  */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*             Author :  Alan W Black                                    */
/*             Date   :  May 1996                                        */
/*-----------------------------------------------------------------------*/
/*                                                                       */
/* Tree-based prediction of intonation.  Uses accent and end             */
/* tone prediction trees, could be ToBI could be something               */
/* else, it's up to the trees to decide ...                              */
/*                                                                       */
/* Accents and boundaries are predicted by CART tree while               */
/* the F0 targets are predicted by linear regression (as                 */
/* described in Black and Hunt ICSLP96)                                  */
/*                                                                       */
/*=======================================================================*/
#include <stdio.h>
#include "festival.h"
#include "intonation.h"

// Where, within a segment, an F0 target is to be placed (see
// add_target_at): at the segment's start, its midpoint or its end.
enum lr_tpos {tp_start, tp_mid, tp_end};

// Forward declarations of the file-local helpers defined further down.
static EST_String accent_specified(EST_Utterance &u,EST_Stream_Item &s);
static EST_String tone_specified(EST_Utterance &u,EST_Stream_Item &s);
static int after_pause(EST_Utterance &u,EST_Stream_Item &s);
static int before_pause(EST_Utterance &u,EST_Stream_Item &s);
static EST_Stream_Item &first_seg(EST_Utterance &u, EST_Stream_Item &syl);
static EST_Stream_Item &last_seg(EST_Utterance &u, EST_Stream_Item &syl);
static EST_Stream_Item &vowel_seg(EST_Utterance &u, EST_Stream_Item &syl);
static void init_int_lr_params(void);
static void add_target_at(float val,EST_Utterance &u,EST_Stream_Item &s,lr_tpos pos);
static float lr_calc(float *svals,LISP lr_model);
static void get_fvals(EST_Utterance &u, EST_Stream_Item &s,float *svals,LISP lr_model);

// Mean and standard deviation used by the mapping macros below.  The
// values here are only defaults: init_int_lr_params() overwrites all
// four from the "int_lr_params" Lisp variable, and that is called at
// the start of FT_Int_Targets_LR_Utt.
static float target_f0_mean = 0.0;
static float target_f0_std = 1.0;
static float model_f0_mean = 0.0;
static float model_f0_std = 1.0;

// MZSCORE:   subtract the model mean and divide by the model std.
// UNTZSCORE: multiply by the target std and add the target mean.
// MAP_F0:    MZSCORE followed by UNTZSCORE, i.e. re-express a value
//            predicted in the model's range in the target's range.
// NOTE(review): MZSCORE divides by model_f0_std with no zero check.
#define MZSCORE(X) (((X)-model_f0_mean)/model_f0_std)
#define UNTZSCORE(X) (((X)*target_f0_std)+target_f0_mean)
#define MAP_F0(X) (UNTZSCORE(MZSCORE(X)))

LISP FT_Intonation_Tree_Utt(LISP utt)
{
    // Creates the "IntEvent" stream and then, for every syllable,
    // decides on up to two intonation events: an accent and an end
    // tone.  For each of the two, an explicitly specified value is
    // used when there is one; only when the *_specified helper
    // answers "0" (nothing specified) is the corresponding CART tree
    // consulted.  A final answer of "NONE" adds no event.
    // Returns the utterance it was given.
    EST_Utterance *u = GETUTTVAL(utt);
    LISP accent_tree, endtone_tree;
    EST_Stream_Item *syl;

    u->create_stream("IntEvent");
    accent_tree = siod_get_lval("int_accent_cart_tree","no accent tree");
    endtone_tree = siod_get_lval("int_tone_cart_tree","no tone cart tree");

    for (syl = u->stream("Syllable").head(); syl != 0; syl = next(syl))
    {
        // Accent: explicit value first, tree prediction as fallback
        EST_String accent = accent_specified(*u,*syl);
        if (accent == "0")
            accent = wagon_predict(*u,*syl,accent_tree);
        if (accent != "NONE")
            add_IntEvent_to_syl(*u,*syl,accent);

        // End tone: same scheme with the tone tree
        EST_String tone = tone_specified(*u,*syl);
        if (tone == "0")
            tone = wagon_predict(*u,*syl,endtone_tree);
        if (tone != "NONE")
            add_IntEvent_to_syl(*u,*syl,tone);
    }

    return utt;
}

static EST_String accent_specified(EST_Utterance &u,EST_Stream_Item &s)
{
    // Looks for an accent given explicitly on the syllable's token
    // ("Word.Token.accent") or, failing that, on its word
    // ("Word.accent").
    //
    // Returns:
    //   "0"     nothing was specified, or this syllable is not the
    //           one that should carry the specified accent
    //   "NONE"  this syllable is stressed but an earlier stressed
    //           syllable was found before reaching a syllable with
    //           a non-"0" syl_break
    //   accent  the specified accent, when this syllable is the one
    //           to carry it
    EST_String accent = ffeature(u,s,"Word.Token.accent");

    if (accent == "0")
    {
        accent = ffeature(u,s,"Word.accent");
        if (accent == "0")
            return accent;              // nothing specified anywhere
    }

    const int stressed = (ffeature(u,s,"stress") == "1");

    if (!stressed)
    {
        // An unstressed syllable only takes the accent when its
        // position_type is "single"
        if (ffeature(u,s,"position_type") == "single")
            return accent;
        return "0";  // pre-specified but inappropriate syllable in word
    }

    // Stressed: the accent goes on the first stressed syllable only.
    // Scan backwards until a syllable with a non-"0" syl_break (accent
    // is ours) or an earlier stressed syllable (it is not).
    EST_Stream_Item *earlier = prev(&s);
    while (earlier != 0)
    {
        if (ffeature(u,*earlier,"syl_break") != "0")
            return accent;
        if (ffeature(u,*earlier,"stress") == "1")
            return "NONE";
        earlier = prev(earlier);
    }

    return accent;  // ran off the start of the utterance
}

static EST_String tone_specified(EST_Utterance &u,EST_Stream_Item &s)
{
    // Looks for a tone given explicitly on the syllable's token
    // ("Word.Token.tone") or, failing that, on its word ("Word.tone").
    // The specified tone is returned only when the syllable's
    // position_type is "single" or "final"; in every other case,
    // including when nothing is specified, the result is "0".
    EST_String tone = ffeature(u,s,"Word.Token.tone");

    if (tone == "0")
    {
        tone = ffeature(u,s,"Word.tone");
        if (tone == "0")
            return tone;                // nothing specified anywhere
    }

    EST_String where = ffeature(u,s,"position_type");
    if ((where == "single") || (where == "final"))
        return tone;

    return "0";  // pre-specified but inappropriate syllable in word
}

LISP FT_Int_Targets_LR_Utt(LISP utt)
{
    // Predict F0 targets using linear regression.
    //
    // Creates the "Target" stream and, for each syllable, evaluates the
    // three LR models held in the Lisp variables f0_lr_start, f0_lr_mid
    // and f0_lr_end on one shared table of feature values.  Each
    // prediction is passed through MAP_F0 before being used.
    //   start: placed at the start of the syllable's first segment;
    //          unless the syllable follows a pause, the value is
    //          averaged with the previous syllable's end prediction
    //   mid:   placed at the middle of the syllable's vowel segment
    //   end:   placed at the end of the last segment, but only when
    //          the syllable is followed by a pause
    // Returns the utterance it was given.
    EST_Utterance *u = GETUTTVAL(utt);
    EST_Stream_Item *s;
    float pstart, pmid, pend;
    LISP start_lr, mid_lr, end_lr;

    init_int_lr_params();
    // Note the models must *all* be the same size (and list the same
    // features in the same order): the value table is sized and filled
    // from start_lr only, then reused for mid_lr and end_lr.
    start_lr = siod_get_lval("f0_lr_start","no f0 start lr model");
    mid_lr = siod_get_lval("f0_lr_mid","no f0 mid lr model");
    end_lr = siod_get_lval("f0_lr_end","no f0 end lr model");
    float *svals = new float[siod_llength(start_lr)];
    
    u->create_stream("Target");
    // Placeholder only: it is read before being set only if the very
    // first syllable is not treated as following a pause.
    pend = -1;

    for (s=u->stream("Syllable").head(); s != 0; s=next(s))
    {
	get_fvals(*u,*s,svals,start_lr);
	pstart = lr_calc(svals,start_lr);
	pstart = MAP_F0(pstart);
	if (after_pause(*u,*s))
	    add_target_at(pstart,*u,first_seg(*u,*s),tp_start);
	else
	    add_target_at((pstart+pend)/2.0,*u,first_seg(*u,*s),tp_start);
	pmid = lr_calc(svals,mid_lr);
	pmid = MAP_F0(pmid);
	add_target_at(pmid,*u,vowel_seg(*u,*s),tp_mid);
	pend = lr_calc(svals,end_lr);
	pend = MAP_F0(pend);
	if (before_pause(*u,*s))
	    add_target_at(pend,*u,last_seg(*u,*s),tp_end);
    }

    // svals came from new[], so it must be released with delete[];
    // a plain delete on an array is undefined behaviour.
    delete [] svals;
    
    return utt;

}

// Accessors for one entry X of an LR model list, as used by get_fvals
// and lr_calc:
//   FFEATURE_NAME      first element, read as a C string
//   FFEATURE_WEIGHT    second element, read as a float
//   FFEATURE_MAPCLASS  third element, passed to siod_member_str as the
//                      list to search (only read when the entry has a
//                      third element)
#define FFEATURE_NAME(X) (get_c_string(car(X)))
#define FFEATURE_WEIGHT(X) (get_c_float(car(cdr(X))))
#define FFEATURE_MAPCLASS(X) (car(cdr(cdr(X))))

static void get_fvals(EST_Utterance &u, EST_Stream_Item &s,float *svals,LISP lr_model)
{
    // Fill svals with the value of every term of lr_model for item s.
    //
    //   svals[0]  the weight of the model's first entry (the intercept)
    //   svals[i]  for the i-th following entry: when the entry carries
    //             a map class, 1.0 if the feature's string value is a
    //             member of that class and 0.0 otherwise; with no map
    //             class, the feature value converted to float
    //
    // svals must have room for one float per entry of lr_model.
    int i;
    LISP f;
    // const: last_name starts out pointing at a string literal, and
    // neither pointer is ever written through.
    const char *ffeature_name, *last_name="";
    EST_Val v=0.0;

    svals[0] = FFEATURE_WEIGHT(car(lr_model)); // Intercept;
    for (i=1,f=cdr(lr_model); CONSP(f); f=CDR(f),i++)
    {
	ffeature_name = FFEATURE_NAME(CAR(f));
	// Consecutive entries naming the same feature reuse the value
	// already fetched rather than evaluating the feature again
	if (!streq(ffeature_name,last_name))
	    v = ffeature(u,s,ffeature_name);
	if (CDR(CDR(CAR(f)))) // A map class is specified
	{   
	    if (siod_member_str(v.string(),FFEATURE_MAPCLASS(CAR(f))) != NIL)
		svals[i] = 1.0;
	    else
		svals[i] = 0.0;
	}
	else
	    svals[i] = (float)v;
	last_name = ffeature_name;
    }
}

static float lr_calc(float *svals,LISP lr_model)
{
    // Find predictor from features and weights
    LISP f;
    int i;
    float answer = svals[0];
    
    for (i=1,f=cdr(lr_model); CONSP(f); f=CDR(f),i++)
	answer += svals[i] * FFEATURE_WEIGHT(CAR(f));

    return answer;
}

static void init_int_lr_params(void)
{
    LISP params;

    params = siod_get_lval("int_lr_params","no lr params");

    target_f0_mean = get_param_float("target_f0_mean",params,0.0);
    target_f0_std = get_param_float("target_f0_std",params,1.0);
    model_f0_mean = get_param_float("model_f0_mean",params,0.0);
    model_f0_std = get_param_float("model_f0_std",params,1.0);
}


static void add_target_at(float val,EST_Utterance &u,EST_Stream_Item &s,lr_tpos pos)
{
    // Add a target of value val to the utterance and link it to
    // segment s.  pos selects the target's position: the segment's
    // start, the midpoint between its start and end, or its end.
    // Any other pos is reported through festival_error().
    EST_Stream_Item *targ;

    switch (pos)
    {
    case tp_start:
        targ = add_target(u,s.start(),val);
        break;
    case tp_mid:
        targ = add_target(u,(s.start()+s.end())/2.0,val);
        break;
    case tp_end:
        targ = add_target(u,s.end(),val);
        break;
    default:
        cerr << "add_target_at: unknown position type\n";
        festival_error();
        break;
    }

    link(*targ,s);
}

static int after_pause(EST_Utterance &u,EST_Stream_Item &s)
{
    // TRUE if the segment immediately before syllable s's first
    // segment is a silence, or if there is no such segment at all

    EST_Stream_Item *before = prev(&first_seg(u,s));

    if (before == 0)
        return TRUE;

    return ph_is_silence(before->name());
}

static int before_pause(EST_Utterance &u,EST_Stream_Item &s)
{
    // TRUE if the segment immediately after syllable s's last
    // segment is a silence, or if there is no such segment at all

    EST_Stream_Item *after = next(&last_seg(u,s));

    if (after == 0)
        return TRUE;

    return ph_is_silence(after->name());
}

static EST_Stream_Item &first_seg(EST_Utterance &u, EST_Stream_Item &syl)
{
    // Reference to the first of the segments linked to syllable syl.
    // A syllable with no linked segments is reported through
    // festival_error().
    EST_Relation *linked = syl.link("Segment");
    EST_TBI *front = linked->head();

    if (front == 0)
    {
        cerr << "first_seg: Syllable has no segments\n";
        festival_error();
    }

    return u.ritem("Segment",(*linked)(front));
}

static EST_Stream_Item &last_seg(EST_Utterance &u, EST_Stream_Item &syl)
{
    // Reference to the last of the segments linked to syllable syl.
    // A syllable with no linked segments is reported through
    // festival_error().
    EST_Relation *linked = syl.link("Segment");
    EST_TBI *back = linked->tail();

    if (back == 0)
    {
        cerr << "last_seg: Syllable has no segments\n";
        festival_error();
    }

    return u.ritem("Segment",(*linked)(back));
}

static EST_Stream_Item &vowel_seg(EST_Utterance &u, EST_Stream_Item &syl)
{
    // Reference to the first segment linked to syllable syl whose name
    // is a vowel; when none of them is a vowel, the first linked
    // segment is returned instead.  A syllable with no linked segments
    // is reported through festival_error().
    EST_Relation *linked = syl.link("Segment");
    EST_TBI *front = linked->head();
    EST_TBI *cur;

    if (front == 0)
    {
        cerr << "vowel_seg: Syllable has no segments\n";
        festival_error();
    }

    for (cur = front; cur != 0; cur = next(cur))
    {
        EST_Stream_Item &seg = u.ritem("Segment",(*linked)(cur));
        if (ph_is_vowel(seg.name()))
            return seg;
    }

    // No vowel found: fall back to the first segment
    return u.ritem("Segment",(*linked)(front));
}
    
    
