/*
 *	Ohio Trollius
 *	Copyright 1996 The Ohio State University
 *	NJN
 *
 *	$Id: spawnmult.c,v 6.1 96/11/22 13:35:16 nevin Rel $
 *
 *	Function:	- spawn multiple MPI programs
 *	Accepts:	- number of command lines
 *			- array of program names
 *			- array of argument vectors
 *			- array of max. number of processes to start
 *			- array of info
 *			- root in spawning communicator
 *			- spawning communicator
 *			- intercomm between parents and children (out)
 *			- array of error codes (out)
 *	Returns:	- MPI_SUCCESS or error code
 */

#include <errno.h>
#include <stdlib.h>

#include <app_mgmt.h>
#include <app_schema.h>
#include <args.h>
#include <blktype.h>
#include <mpi.h>
#include <mpisys.h>
#include <ndi.h>
#include <net.h>
#include <portable.h>
#include <rpisys.h>
#include <terror.h>
#include <typical.h>

/*
 * private functions
 */
static int		spawn();
static LIST		*build_app();
static void		set_error_codes();


int
MPI_Spawn_multiple(count, commands, argvs,
			maxprocs, infos, root, comm, intercomm, errcodes)

int			count;
char			**commands;
char			***argvs;
int			*maxprocs;
MPI_Info		*infos;
int			root;
MPI_Comm 		comm;
MPI_Comm 		*intercomm;
int			*errcodes;

{
	MPI_Status	stat;
	MPI_Group	kgrp;			/* child group */
	LIST		*app;			/* application */
	struct _gps	*kids;			/* array of child GPS */
	struct _gps	*g;
	struct _proc	**p;
	int		rank;			/* caller rank */
	int		size;			/* group size */
	int		err;			/* error code */
	int		numkids;		/* num. of children spawned */
	int		mycid;			/* local max context ID */
	int		cid;			/* context ID for intercomm */
	int		msg[2];			/* two int message buffer */
	int		i;
	
	lam_initerr();
	lam_setfunc(BLKMPISPAWNMULT);
/*
 * Check the arguments.
 */
	if ((comm == MPI_COMM_NULL) || LAM_IS_INTER(comm)) {
		return(lam_errfunc(comm, BLKMPISPAWNMULT,
					lam_mkerr(MPI_ERR_COMM, 0)));
	}

	MPI_Comm_size(comm, &size);

	if ((root >= size) || (root < 0)) {
		return(lam_errfunc(comm, BLKMPISPAWNMULT,
				lam_mkerr(MPI_ERR_ROOT, 0)));
	}

	if (intercomm == 0) {
		return(lam_errfunc(MPI_COMM_WORLD,
			BLKMPISPAWNMULT, lam_mkerr(MPI_ERR_ARG, 0)));
	}

	LAM_TRACE(lam_tr_cffstart(BLKMPISPAWNMULT));
/*
 * Set debugging parameters.
 */
	g = &(comm->c_group->g_procs[root]->p_gps);

	lam_setparam(BLKMPISPAWNMULT, root | (g->gps_grank << 16),
				(g->gps_node << 16) | g->gps_idx);
/*
 * Synchronize all members of the parent group and get the context ID
 * for the parent-child intercommunicator.
 */
	MPI_Comm_rank(comm, &rank);

	mycid = lam_getcid();

	if (mycid < 0) {
		return(lam_errfunc(comm, BLKMPISPAWNMULT,
				lam_mkerr(MPI_ERR_INTERN, EFULL)));
	}

	err = MPI_Reduce(&mycid, &cid, 1, MPI_INT, MPI_MAX, root, comm);
	if (err != MPI_SUCCESS) {
		return(lam_errfunc(comm, BLKMPISPAWNMULT, err));
	}
	
	if (rank == root) {
/*
 * The root does the process spawning.
 */
		err = MPI_SUCCESS;
		
		app = build_app(count, commands, argvs, maxprocs, infos);
		if (app == 0) {
			err = lam_mkerr(MPI_ERR_OTHER, errno);
		} else {
			if (spawn(app, comm, cid, &numkids, &kids)) {
				err = lam_mkerr(MPI_ERR_SPAWN, 0);
			}
			set_error_codes(err, app, errcodes);
		}
/*
 * Inform parent group of spawn error.
 */
		if (err != MPI_SUCCESS) {
			msg[0] = -1; msg[1] = err;
			MPI_Bcast(msg, 2, MPI_INT, root, comm);
			return(lam_errfunc(comm, BLKMPISPAWNMULT, err));
		}
		
		msg[0] = cid; msg[1] = numkids;
	}
/*
 * Broadcast the context ID for the parent-child intercommunicator and the
 * number of children spawned to the parents.  In the case of an error
 * in spawning the root broadcasted an error code.
 */
	err = MPI_Bcast(msg, 2, MPI_INT, root, comm);
	if (err != MPI_SUCCESS) {
		return(lam_errfunc(comm, BLKMPISPAWNMULT, err));
	}
	
	if (rank != root) {

		cid = msg[0];
/*
 * A context ID of -1 means an error occurred in spawning so we
 * return with the error.
 */
		if (cid == -1) {
			err = msg[1];
			return(lam_errfunc(comm, BLKMPISPAWNMULT, err));
		}
/*
 * Allocate buffer to receive array of child GPS.
 */
		numkids = msg[1];
		kids = (struct _gps *)
			malloc((unsigned) (numkids * sizeof(struct _gps)));
		if (kids == 0) {
			return(lam_errfunc(comm, BLKMPISPAWNMULT,
					lam_mkerr(MPI_ERR_OTHER, errno)));
		}
	}
/*
 * Broadcast the array of child GPS to parent group.
 */
	err = MPI_Bcast(kids, numkids * sizeof(struct _gps) / sizeof(int),
			MPI_INT, root, comm);
	if (err != MPI_SUCCESS) {
		free((char *) kids);
		return(lam_errfunc(comm, BLKMPISPAWNMULT, err));
	}
/*
 * Create the child group.
 */
	kgrp = (MPI_Group) malloc((unsigned) sizeof(struct _group) +
					(numkids * sizeof(struct _proc *)));
	if (kgrp == 0) {
		free((char *) kids);
		return(lam_errfunc(comm, BLKMPISPAWNMULT,
					lam_mkerr(MPI_ERR_OTHER, errno)));
	}
	kgrp->g_nprocs = numkids;
	kgrp->g_myrank = MPI_UNDEFINED;
	kgrp->g_refcount = 1;
	kgrp->g_procs = (struct _proc **)
				((char *) kgrp + sizeof(struct _group));

	g = kids;
	p = kgrp->g_procs;

	for (i = 0; i < numkids; ++i, ++p, ++g) {

		if ((*p = lam_procadd(g)) == 0) {
			free((char *) kids);
			free((char *) kgrp);
			return(lam_errfunc(comm, BLKMPISPAWNMULT,
					lam_mkerr(MPI_ERR_OTHER, errno)));
		}
		(*p)->p_mode |= LAM_PCLIENT;
		(*p)->p_refcount++;
	}
/*
 * Create the parent-child intercommunicator.
 */
	*intercomm = 0;
	if (lam_comm_new(cid, comm->c_group, kgrp, LAM_CINTER, intercomm)) {
		free((char *) kids);
		free((char *) kgrp);
		return(lam_errfunc(comm, BLKMPISPAWNMULT,
				lam_mkerr(MPI_ERR_OTHER, errno)));
	}

	comm->c_group->g_refcount++;
	(*intercomm)->c_errhdl = comm->c_errhdl;
	comm->c_errhdl->eh_refcount++;

	if (!al_insert(lam_comms, intercomm)) {
		return(lam_errfunc(comm, BLKMPISPAWNMULT,
				lam_mkerr(MPI_ERR_INTERN, errno)));
	}

	if (lam_tr_comm(*intercomm)) {
		return(lam_errfunc(comm, BLKMPISPAWNMULT,
				lam_mkerr(MPI_ERR_INTERN, errno)));
	}

	lam_setcid(cid);
/*
 * setup new processes
 */
	if (RPI_SPLIT(_rpi_lamd_addprocs, _rpi_c2c_addprocs, ())) {
		return(lam_errfunc(comm, BLKMPISPAWNMULT,
				lam_mkerr(MPI_ERR_OTHER, errno)));
	}
/*
 * Wait until all the children have initialized.
 * The root waits for rank 0 in the child world to communicate this fact and
 * then broadcasts it to the other parents.
 */
	if (rank == root) {
		err = MPI_Recv((void *)0, 0, MPI_BYTE, 0, 0, *intercomm, &stat);
		if (err != MPI_SUCCESS) {
			return(lam_errfunc(comm, BLKMPISPAWNMULT, err));
		}
	}

	err = MPI_Bcast((void *) 0, 0, MPI_BYTE, root, comm);
	
	LAM_TRACE(lam_tr_cffend(BLKMPISPAWNMULT, root, comm, 0, 0));
	
	lam_resetfunc(BLKMPISPAWNMULT);
	return(MPI_SUCCESS);
}

/*
 *	spawn
 *
 *	Function:	- spawn MPI processes according to app. descriptor
 *	Accepts:	- application descriptor
 *			- parent communicator
 *			- context ID for parent-child intercommunicator
 *			- number of children (returned)
 *			- array of child GPS (returned)
 *	Returns:	- 0 or LAMERROR
 */
static int
spawn(app, comm, cid, numkids, kids)

LIST			*app;
MPI_Comm		comm;
int			cid;
int			*numkids;
struct _gps		**kids;

{
	struct nmsg	nhead;			/* network msg header */
	struct _proc	**g;			/* process in group */
	struct _gps	*world;			/* child world GPS array */
	struct _gps	*p;			/* process GPS */
	int4		rtf;			/* child runtime flags */
	int		rank;			/* my (spawner's) rank */
	int		parent_n;		/* size of parent world */
	int		world_n;		/* size of child world */
	int		i;
/*
 * Set environment inherited by children.  The world spawning them consists
 * solely of the parent group.
 */
	rtf = _kio.ki_rtf;
	rtf &= ~(RTF_MPIRUN | RTF_TRON | RTF_FLAT);

	MPI_Comm_size(comm, &parent_n);
/*
 * Allocate combined parent and child GPS array.
 */
	world_n = al_count(app);
	world = (struct _gps *)
		malloc((unsigned) (world_n + parent_n) * sizeof(struct _gps));
	if (world == 0) return(LAMERROR);
/*
 * Run the application.
 */
	if (asc_run(app, parent_n, rtf, 0, 0, world)) {
		free((char *) world);
		return(LAMERROR);
	}
/*
 * Fill in child ranks in their MPI_COMM_WORLD.
 */
	for (i = 0, p = world; i < world_n; ++i, ++p) {
		p->gps_grank = i;
	}
/*
 * Fill in the parent world GPS.
 */
	g = comm->c_group->g_procs;

	for (i = 0; i < parent_n; ++i, ++p, ++g) {
		*p = (*g)->p_gps;
	}
/*
 * Set up the message.
 */
	MPI_Comm_rank(comm, &rank);
	nhead.nh_type = 0;
	nhead.nh_flags = DINT4MSG;
	nhead.nh_msg = (char *) world;
	nhead.nh_length = (world_n + parent_n) * sizeof(struct _gps);
	nhead.nh_data[1] = (int4) cid;
	nhead.nh_data[2] = (int4) rank;
	nhead.nh_data[3] = (int4) lam_universe_size;
/*
 * Loop sending to each child process.
 */
	for (i = 0, p = world; i < world_n; ++i, ++p) {
		nhead.nh_node = p->gps_node;
		nhead.nh_event = - p->gps_pid;
		if (nsend(&nhead)) {
			free((char *) world);
			return(LAMERROR);
		}
	}
	
	*numkids = world_n;
	*kids = world;
	return(0);
}

/*
 *	build_app
 *
 *	Function:	- build an application
 *	Accepts:	- number of command lines
 *			- array of program names
 *			- array of argument vectors
 *			- array of max. number of processes to start
 *			- array of info
 *	Returns:	- application descriptor or 0
 */
static LIST *
build_app(count, commands, argvs, maxprocs, infos)

int			count;
char			**commands;
char			***argvs;
int			*maxprocs;
MPI_Info		*infos;

{
	LIST		*app;			/* application */
	LIST		*app_sched;		/* scheduled application */
	struct apparg 	*pargv;			/* process argv */
	struct aschema	proc;			/* process list entry */
	int		argv_n;			/* number args in argv */
	int		i, j;
/*
 * Create new empty application.
 */
	if ((app = al_init(sizeof(struct aschema), (int (*)()) 0)) == 0) {
		return(0);
	}
/*
 * Loop through the command lines adding them to the application.
 */
	for (i = 0; i < count; i++) {
		proc.asc_nodelist = 0;
		proc.asc_proc_cnt = maxprocs[i];
		proc.asc_srcnode = -1;
/*
 * Build the argument vector.
 */
		pargv = (struct apparg *) malloc(sizeof(struct apparg));
		if (pargv == 0) {
			asc_free(app);
			return(0);
		}
		
		pargv->apa_argv = 0;
		pargv->apa_argc = 0;
		pargv->apa_refcount = 1;
/*
 * Add the command name.
 */
		if (argvadd(&pargv->apa_argc, &pargv->apa_argv, commands[i])) {
			asc_free(app);
			argvfree(pargv->apa_argv);
			free((char *) pargv);
			return(0);
		}
/*
 * Add the argument vector for this command.
 */
		if (argvs != MPI_ARGVS_NULL) {
			argv_n = argvcount(argvs[i]);
			for (j = 0; j < argv_n; j++) {
				if (argvadd(&pargv->apa_argc, &pargv->apa_argv, 
						argvs[i][j])) {
					asc_free(app);
					argvfree(pargv->apa_argv);
					free((char *) pargv);
					return(0);
				}
			}
		}

		proc.asc_args = pargv;

		if (al_append(app, &proc) == 0) {
			asc_free(app);
			argvfree(pargv->apa_argv);
			free((char *) pargv);
			return(0);
		}
	}
/*
 * Schedule the application.
 */
	app_sched = asc_schedule(app);
	asc_free(app);
	
	return(app_sched);
}

/*
 *	set_error_codes
 *
 *	Function:	- set error codes for spawned processes
 *	Accepts:	- spawn error code
 *			- application
 *			- array of error codes (out)
 */
static void
set_error_codes(err, app, errs)

int			err;
LIST			*app;
int			*errs;

{
	int		i;
	struct aschema  *p;

	if (errs == MPI_ERRCODES_DONTCARE) {
		return;
	}

	if (err) {
		for (p = al_top(app); p; p = al_next(app, p), errs++) {
			*errs = lam_mkerr(MPI_ERR_SPAWN, p->asc_errno);
		}
	} else {
		for (i = al_count(app) - 1; i >= 0; i--) {
			errs[i] = MPI_SUCCESS;
		}
	}
}
