/* * Copyright (c) 2009-2011 The Trustees of Indiana University. * All rights reserved. * * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * * Copyright (c) 2004-2006 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #include "orte_config.h" #include #ifdef HAVE_UNISTD_H #include #endif /* HAVE_UNISTD_H */ #include #include "opal/util/output.h" #include "opal/dss/dss.h" #include "orte/util/error_strings.h" #include "orte/util/name_fns.h" #include "orte/util/show_help.h" #include "orte/util/threads.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/rml/rml.h" #include "orte/mca/odls/odls_types.h" #include "orte/mca/state/state.h" #include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/errmgr_private.h" #include "errmgr_default_tool.h" /* * Module functions: Global */ static int init(void); static int finalize(void); static int abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs, int error_code); /****************** * HNP module ******************/ orte_errmgr_base_module_t orte_errmgr_default_tool_module = { .init= init, .finalize = finalize, .logfn = orte_errmgr_base_log, .abort = orte_errmgr_base_abort, .abort_peers = abort_peers }; static void proc_errors(int fd, short args, void *cbdata); /************************ * API Definitions ************************/ static int init(void) { /* setup state machine to trap proc errors */ orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI); return ORTE_SUCCESS; } static int finalize(void) { return ORTE_SUCCESS; } static void proc_errors(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; ORTE_ACQUIRE_OBJECT(caddy); OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output, "%s errmgr:default_tool: proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&caddy->name), orte_proc_state_to_str(caddy->proc_state))); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { OBJ_RELEASE(caddy); return; } /* if we lost our lifeline, then just stop the event loop * so the main program can cleanly terminate */ if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) { ORTE_POST_OBJECT(caddy); orte_event_base_active = false; } else { /* all other errors require abort */ orte_errmgr_base_abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL); } OBJ_RELEASE(caddy); } static int abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs, int error_code) { /* just abort */ if (0 < opal_output_get_verbosity(orte_errmgr_base_framework.framework_output)) { orte_errmgr_base_abort(error_code, "%s called abort_peers", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } else { orte_errmgr_base_abort(error_code, NULL); } return ORTE_SUCCESS; }