#!/usr/bin/bash
#
# SAPHanaSR-alert-fencing
# Author:       Lars Pinne Fabian Herschel, June 2024
# Support:      linux@sap.com
# License:      GNU General Public License (GPL)
# Copyright:    (c) 2024 SUSE LLC
# Version:      2025-07-17
#
# Configure the alert with crmsh:
#
# crm configure alert nodes-1 "/usr/bin/SAPHanaSR-alert-fencing" select nodes
# crm configure alert fencing-1 "/usr/bin/SAPHanaSR-alert-fencing" select fencing attributes alert_uptime_threshold=300
#
# Configure the alert with pcs:
#
# pcs alert create id=alert-hana-1 path=/usr/bin/SAPHanaSR-alert-fencing options alert_uptime_threshold=300
# cibadmin -CMX '<configuration> <alerts> <alert id="alert-hana-1"> <select> <select_nodes /> <select_fencing /> </select> </alert> </alerts> </configuration>'
#
# Controlling the fence attributes:
# - hana_fence_action: allowed settings are terminate (default), fence or reboot
# - hana_fence_timeout: default is 120 seconds, only applies to 'fence' or 'reboot' actions
#
# Examples:
# crm_attribute --name=hana_fence_action --update=reboot
# crm_attribute --name=hana_fence_timeout --update=300

# --- logging -----------------------------------------------------------------
# Every message goes to syslog through $logger, tagged with $logger_tag.
logger="/usr/bin/logger"
logger_tag="SAPHanaSR-alert-fencing"

# --- alert context -----------------------------------------------------------
# The CRM_alert_* variables are taken from the environment; each one gets a
# fallback so the script can also be started by hand.
# (disabled) ON_FAIL_ACTION="${OCF_RESKEY_ON_FAIL_ACTION:-proceed}"
: "${CRM_alert_recipient:=/dev/null}"
crm_alert_kind=${CRM_alert_kind:-manual call}
crm_alert_node=${CRM_alert_node:-$HOSTNAME}
crm_alert_desc=${CRM_alert_desc:-no description provided}

# --- tunables ----------------------------------------------------------------
# Minimum uptime (seconds) before a fencing alert is acted on; default 300.
: "${alert_uptime_threshold:=300}"

# Site cache, one "host:site_name" entry per line.
cache_file="/run/crm/SAPHanaSR_site_cache"

# Splitting the first line of /proc/uptime at '.' leaves the whole seconds in
# sys_uptime; everything after the first dot lands in REST and is not used.
IFS=. read -r sys_uptime REST < /proc/uptime

$logger -t "$logger_tag" "AH: begin event '$crm_alert_kind' (uptime=$sys_uptime, alert_uptime_threshold=$alert_uptime_threshold)"


function process_fencing()
{
    # Handle a 'fencing' alert: if the fenced node (crm_alert_node) belongs to
    # the same site as the local node, trigger the configured fence action
    # against the local node as well.
    #
    # Globals read: sys_uptime, alert_uptime_threshold, cache_file,
    #               crm_alert_node, HOSTNAME, logger, logger_tag
    # Arguments:    none
    # Outputs:      decisions (DEC), infos (INFO) and failed actions (ACT)
    #               are written via $logger
    #
    # SAPHanaSR_site_cache has format (each line) host:site_name
    local fenced_site_name local_site_name
    local hana_fence_action hana_fence_timeout rc

    # Only act once the local node has been up for at least
    # alert_uptime_threshold seconds.
    if [[ "$sys_uptime" -ge "$alert_uptime_threshold" ]]; then
        if [[ -e "$cache_file" ]]; then
            # figure out fenced site and local site
            fenced_site_name=$(awk -F: -v host="${crm_alert_node}" '$1 == host { print $2 }' "$cache_file")
            local_site_name=$(awk -F: -v host="${HOSTNAME}" '$1 == host { print $2 }' "$cache_file")
            $logger  -t "$logger_tag" "INFO: cache_file=$cache_file, crm_alert_node=$crm_alert_node"
            $logger  -t "$logger_tag" "INFO: fenced_site_name=$fenced_site_name, local_site_name=$local_site_name"
            # An empty local site (host not found in the cache) never matches.
            if [[ "$local_site_name" != "" && "$fenced_site_name" == "$local_site_name" ]]; then
                $logger  -t "$logger_tag" "DEC: FENCE ($fenced_site_name == $local_site_name)"
                # NOTE(review): fixed 10s delay before acting; the reason is
                # not visible in this file - confirm before changing it.
                sleep 10
                # Check if hana_fence_action is set, if not use 'terminate'.
                hana_fence_action="$(/usr/sbin/crm_attribute -n hana_fence_action -G -d terminate -q)"
                case "${hana_fence_action}" in
                    terminate)
                        # Set the 'terminate' attribute for the local node in
                        # the status section.
                        /usr/sbin/crm_attribute -t status -N "${HOSTNAME}" -n terminate -v true
                        rc="$?"
                        if [[ "$rc" != "0" ]]; then
                            # Log the saved rc: "$?" here would be the status
                            # of the [[ ]] test above, i.e. always 0.
                            $logger  -t "$logger_tag" "ACT: /usr/sbin/crm_attribute -t status -N \"${HOSTNAME}\" -n terminate -v true; rc=$rc"
                        fi
                        ;;
                    fence | reboot)
                        hana_fence_timeout="$(/usr/sbin/crm_attribute -n hana_fence_timeout -G -d 120 -q)"
                        # If the query printed nothing, fall back to the
                        # documented default of 120 seconds.
                        hana_fence_timeout="${hana_fence_timeout:-120}"
                        /usr/sbin/stonith_admin --"${hana_fence_action}"="${HOSTNAME}" --timeout="${hana_fence_timeout}"
                        rc="$?"
                        if [[ "$rc" != "0" ]]; then
                            # Same as above: report the saved rc, not "$?".
                            $logger  -t "$logger_tag" "ACT:  /usr/sbin/stonith_admin --${hana_fence_action}=\"${HOSTNAME}\" --timeout=${hana_fence_timeout}; rc=$rc"
                        fi
                        ;;
                     *)
                        $logger -t "$logger_tag" "ACT: unknown fence-action \"$hana_fence_action\""
                        ;;
                esac
            else
                $logger  -t "$logger_tag" "DEC: NO FENCE ($fenced_site_name != $local_site_name)"
            fi
        else
            $logger  -t "$logger_tag" "DEC: NO FENCE (no cache)"
        fi
    else
        $logger  -t "$logger_tag" "DEC: NO FENCE (uptime < alert_uptime_threshold)"
    fi
}

function check_fencing()
{
    # Dry run of the fencing decision: look up the site of the alerting node
    # and of the local node in the site cache and log what would be decided.
    # No fence action is taken here.
    #
    # Cache layout: one "host:site_name" entry per line.
    local verdict

    # Without a cache there is nothing to compare.
    if [[ ! -e "$cache_file" ]]; then
        $logger -t "$logger_tag" "DEC: NO FENCE (no cache)"
        return
    fi

    fenced_site_name=$(awk -F: -v host="${crm_alert_node}" '$1 == host { print $2 }' "$cache_file")
    local_site_name=$(awk -F: -v host="${HOSTNAME}" '$1 == host { print $2 }' "$cache_file")
    $logger -t "$logger_tag" "INFO: cache_file=$cache_file"
    $logger -t "$logger_tag" "INFO: fenced_site_name=$fenced_site_name, local_site_name=$local_site_name"

    # Same, non-empty site on both sides means the local node would be fenced.
    if [[ -n "$local_site_name" && "$local_site_name" == "$fenced_site_name" ]]; then
        verdict="DEC: FENCE ($fenced_site_name == $local_site_name)"
    else
        verdict="DEC: NO FENCE ($fenced_site_name != $local_site_name)"
    fi
    $logger -t "$logger_tag" "$verdict"
}

# Dispatch on the alert kind: choose the INFO text and, where one exists, the
# handler function; then log the text, run the handler and log the end marker.
alert_handler=""
case "$crm_alert_kind" in
    node|nodes)
        # Node events are only logged.
        msg="Node '${crm_alert_node}' is now '${crm_alert_desc}'"
        ;;
    fencing)
        msg="Fencing for '${crm_alert_node}': ${crm_alert_desc}"
        alert_handler="process_fencing"
        ;;
    check)
        msg="Checking for '${crm_alert_node}': ${crm_alert_desc}"
        alert_handler="check_fencing"
        ;;
    *)
        # Any other kind is logged as unhandled.
        msg="Unhandled '$crm_alert_kind' alert (${crm_alert_desc})"
        ;;
esac

$logger -t "$logger_tag" "INFO: $msg"
if [[ -n "$alert_handler" ]]; then
    "$alert_handler"
fi
$logger -t "$logger_tag" "AH: end event '$crm_alert_kind'"
#
