#!/usr/bin/bash
#
# Copyright 2015 Red Hat, Inc.
#
# Description:  Manages evacuation of nodes running nova-compute
#
# Authors: Andrew Beekhof
#
# Support:      openstack@lists.openstack.org
# License:      Apache Software License (ASL) 2.0
#


#######################################################################
# Initialization:

###
# Locate the OCF shell function library. A value of OCF_FUNCTIONS_DIR that is
# already set (even to the empty string, since "=" rather than ":=" is used)
# is kept; otherwise the directory is derived from OCF_ROOT.
: "${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}"
# Quoted so that a path containing whitespace or glob characters is sourced
# as one file name instead of being split or expanded.
. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"
###

# The requested action defaults to the first command-line argument unless
# __OCF_ACTION is already set.
: "${__OCF_ACTION=$1}"

#######################################################################

# Print the resource agent meta-data (an XML document) on stdout.
#
# The document describes the agent, each configurable parameter together with
# its advertised default, and the supported actions with their timeouts.
# The here-document delimiter is unquoted, so shell expansion applies to the
# text below; it currently contains nothing that expands.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="NovaEvacuate" version="1.0">
<version>1.0</version>

<longdesc lang="en">
Facility for tacking a list of compute nodes and reliably evacuating the ones that fence_evacuate has flagged.
</longdesc>
<shortdesc lang="en">Evacuator for OpenStack Nova Compute Server</shortdesc>

<parameters>

<parameter name="auth_url" unique="0" required="1">
<longdesc lang="en">
Authorization URL for connecting to keystone in admin context
</longdesc>
<shortdesc lang="en">Authorization URL</shortdesc>
<content type="string" default="" />
</parameter>

<parameter name="username" unique="0" required="1">
<longdesc lang="en">
Username for connecting to keystone in admin context
</longdesc>
<shortdesc lang="en">Username</shortdesc>
<content type="string" default="" />
</parameter>

<parameter name="password" unique="0" required="1">
<longdesc lang="en">
Password for connecting to keystone in admin context
</longdesc>
<shortdesc lang="en">Password</shortdesc>
<content type="string" default="" />
</parameter>

<parameter name="tenant_name" unique="0" required="1">
<longdesc lang="en">
Tenant(Project) name for connecting to keystone in admin context.
Note that with Keystone V3 tenant names are only unique within a domain.
</longdesc>
<shortdesc lang="en">Tenant name</shortdesc>
<content type="string" default="" />
</parameter>

<parameter name="user_domain" unique="0" required="0">
<longdesc lang="en">
Keystone domain the user belongs to
</longdesc>
<shortdesc lang="en">Keystone v3 User Domain</shortdesc>
<content type="string" default="" />
</parameter>

<parameter name="project_domain" unique="0" required="0">
<longdesc lang="en">
Keystone domain the tenant(project) belongs to
</longdesc>
<shortdesc lang="en">Keystone v3 Project Domain</shortdesc>
<content type="string" default="" />
</parameter>

<parameter name="domain" unique="0" required="0">
<longdesc lang="en">
DNS domain in which hosts live, useful when the cluster uses short names and nova uses FQDN
</longdesc>
<shortdesc lang="en">DNS domain</shortdesc>
<content type="string" default="" />
</parameter>

<parameter name="endpoint_type" unique="0" required="0">
<longdesc lang="en">
Nova API location (internal, public or admin URL)
</longdesc>
<shortdesc lang="en">Nova API location (internal, public or admin URL)</shortdesc>
<content type="string" default="" />
</parameter>

<parameter name="region_name" unique="0" required="0">
<longdesc lang="en">
Region name for connecting to nova.
</longdesc>
<shortdesc lang="en">Region name</shortdesc>
<content type="string" default="" />
</parameter>

<parameter name="insecure" unique="0" required="0">
<longdesc lang="en">
Explicitly allow client to perform "insecure" TLS (https) requests.
The server's certificate will not be verified against any certificate authorities.
This option should be used with caution.
</longdesc>
<shortdesc lang="en">Allow insecure TLS requests</shortdesc>
<content type="boolean" default="0" />
</parameter>

<parameter name="no_shared_storage" unique="0" required="0">
<longdesc lang="en">
Indicate that nova storage for instances is not shared across compute
nodes. This must match the reality of how nova storage is configured!
Otherwise VMs could end up in error state upon evacuation. When
storage is non-shared, instances on dead hypervisors will be rebuilt
from their original image or volume, so anything on ephemeral storage
will be lost.
</longdesc>
<shortdesc lang="en">Disable shared storage recovery for instances</shortdesc>
<content type="boolean" default="0" />
</parameter>

<parameter name="verbose" unique="0" required="0">
<longdesc lang="en">
Enable extra logging from the evacuation process
</longdesc>
<shortdesc lang="en">Enable debug logging</shortdesc>
<content type="boolean" default="0" />
</parameter>

<parameter name="evacuate_delay" unique="0" required="0">
<longdesc lang="en">
Allows delaying the nova evacuate API call, e.g. to give a storage array time to clean
up eventual locks/leases.
</longdesc>
<shortdesc lang="en">Nova evacuate delay</shortdesc>
<content type="integer" default="0" />
</parameter>

</parameters>

<actions>
<action name="start"        timeout="20" />
<action name="stop"         timeout="20" />
<action name="monitor"      timeout="600" interval="10" depth="0"/>
<action name="validate-all" timeout="20" />
<action name="meta-data"    timeout="5" />
</actions>
</resource-agent>
END
}

#######################################################################

# TERM is deliberately not fatal here: the handler only logs and returns, to
# test that lrmd makes sure that we do exit.
sigterm_handler() {
    ocf_log info "They use TERM to bring us down. No such luck."
    return
}
trap 'sigterm_handler' TERM

# Print the command-line synopsis of this agent on stdout: the list of
# accepted actions, a blank line, and a note about the required environment.
evacuate_usage() {
    printf '%s\n' \
        "usage: $0 {start|stop|monitor|validate-all|meta-data}" \
        "" \
        "Expects to have a fully populated OCF RA-compliant environment set."
}

# Stop action: delete the state file that marks this instance as active.
# The result of the removal is not inspected; the function always returns
# $OCF_SUCCESS.
evacuate_stop() {
    rm -f "${statefile}"

    return ${OCF_SUCCESS}
}

# Start action: create (or refresh) the state file that marks this instance
# as active. Returns the exit status of touch.
evacuate_start() {
    # Do not invoke monitor here so that the start timeout can be low.
    # touch is the last command, so its status is the function's status.
    touch "${statefile}"
}

# Set the "evacuate" attribute of a node through attrd_updater.
#
# Arguments:
#   $1 - node name the attribute is set for
#   $2 - new value of the attribute
# Returns:
#   the exit status of attrd_updater; a warning is logged when it is non-zero.
update_evacuation() {
    local arc

    # Both arguments are quoted so that each reaches attrd_updater as exactly
    # one word, whatever characters it contains.
    attrd_updater -p -n evacuate -Q -N "${1}" -U "${2}"
    arc=$?
    if [ "${arc}" -ne 0 ]; then
        ocf_log warn "Can not set evacuation state of ${1} to ${2}: ${arc}"
    fi
    return "${arc}"
}

# Walk a list of (node, state) pairs and evacuate every node that needs it.
#
# Arguments:
#   alternating node names and "evacuate" attribute values. Recognised values:
#     ""           - ignored
#     no           - nothing to do
#     yes          - evacuation requested
#     <who>@<when> - an evacuation was started by <who> at epoch <when>; it is
#                    retried once it is more than 60 seconds old
# Globals read:
#   fence_options, OCF_RESKEY_evacuate_delay, OCF_SUCCESS
# Returns:
#   always $OCF_SUCCESS. Processing of the remaining nodes is abandoned when
#   the in-progress marker of a node cannot be recorded.
handle_evacuations() {
    local node state need_evacuate where when now stale fence_agent rc
    # evacuate_delay is optional; without a fallback an unset value turns the
    # comparison below into a "[" syntax error on every evacuation.
    local delay=${OCF_RESKEY_evacuate_delay:-0}

    while [ $# -gt 0 ]; do
        node=$1
        state=$2
        # Two single shifts rather than "shift 2": with a dangling node name
        # the second shift simply fails and the loop still terminates.
        shift; shift
        need_evacuate=0

        case $state in
            "")
                ;;
            no)
                ocf_log debug "$node is either fine or already handled"
                ;;
            yes)
                need_evacuate=1
                ;;
            *@*)
                # First and second "@"-separated fields of the state.
                where=${state%%@*}
                when=${state#*@}
                when=${when%%@*}
                now=$(date +%s)

                stale=0
                case $when in
                    ''|*[!0-9]*)
                        # No usable timestamp, so the age is unknown; treat
                        # the marker as stale rather than failing arithmetic.
                        stale=1
                        ;;
                    *)
                        # 10# forces base 10 so leading zeros are harmless.
                        if [ $(( now - 10#$when )) -gt 60 ]; then
                            stale=1
                        fi
                        ;;
                esac

                if [ "$stale" = 1 ]; then
                    ocf_log info "Processing partial evacuation of $node by" \
                        "$where at $when"
                    need_evacuate=1
                else
                    # Give some time for any in-flight evacuations to either
                    # complete or fail. Nova won't react well if there are two
                    # overlapping requests.
                    ocf_log info "Deferring processing partial evacuation of" \
                        "$node by $where at $when"
                fi
                ;;
        esac

        if [ "$need_evacuate" = 1 ]; then
            # Prefer fence_evacuate when it is installed.
            fence_agent="fence_compute"
            if have_binary fence_evacuate; then
                fence_agent="fence_evacuate"
            fi

            if [ "$delay" != 0 ]; then
                ocf_log info "Delaying nova evacuate by $delay seconds"
                sleep "$delay"
            fi

            ocf_log notice "Initiating evacuation of $node with $fence_agent"
            # fence_options is a flat option string and is left unquoted on
            # purpose so that it is split into separate arguments.
            "$fence_agent" ${fence_options} -o status -n "${node}"
            if [ $? = 1 ]; then
                ocf_log info "Nova does not know about ${node}"
                # Don't mark as no because perhaps nova is unavailable right now
                continue
            fi

            # Record who started the evacuation and when, before starting it.
            if ! update_evacuation "${node}" "$(uname -n)@$(date +%s)"; then
                return $OCF_SUCCESS
            fi

            "$fence_agent" ${fence_options} -o off -n "${node}"
            rc=$?

            if [ "$rc" = 0 ]; then
                update_evacuation "${node}" no
                ocf_log notice "Completed evacuation of $node"
            else
                ocf_log warn "Evacuation of $node failed: $rc"
                # Flag the node again so that a later run retries it.
                update_evacuation "${node}" yes
            fi
        fi
    done

    return $OCF_SUCCESS
}

# Monitor action: report $OCF_NOT_RUNNING when the state file is absent;
# otherwise read the "evacuate" attribute of all nodes, hand the resulting
# node/value pairs to handle_evacuations and return $OCF_SUCCESS.
evacuate_monitor() {
    local pending

    [ -f "$statefile" ] || return $OCF_NOT_RUNNING

    # Turn the attrd_updater output into "node value" pairs: empty values
    # become "no", then '=' and '"' are blanked so that awk can pick fields
    # 4 and 6. "attribute does not exist" messages are dropped from stderr.
    pending=$(
        attrd_updater -n evacuate -A \
            2> >(grep -v "attribute does not exist" 1>&2) |
            sed 's/ value=""/ value="no"/' |
            tr '="' '  ' |
            awk '{print $4" "$6}'
    )

    # Unquoted on purpose: every node and value must become its own argument.
    handle_evacuations $pending
    return $OCF_SUCCESS
}

# Validate the environment and configuration, and build the fence agent
# option string.
#
# Globals read:
#   statefile and the OCF_RESKEY_* resource parameters
# Globals written:
#   fence_options - reset and filled with the options for the fence agent
# Returns:
#   $OCF_SUCCESS when everything checks out, $OCF_ERR_ARGS when the state
#   directory is not writable.
# Exits:
#   with $OCF_ERR_CONFIGURED when a mandatory parameter is missing or
#   endpoint_type has an unsupported value.
evacuate_validate() {
    local state_dir probe

    fence_options=""

    # fence_compute is only required when fence_evacuate is not available.
    if ! have_binary fence_evacuate; then
        check_binary fence_compute
    fi

    # Is the state directory writable? The path is quoted so that a
    # directory containing whitespace is not split before reaching dirname.
    state_dir=$(dirname "$statefile")
    probe="$state_dir/$$"
    if ! touch "$probe"; then
        ocf_exit_reason "Invalid state directory: $state_dir"
        return $OCF_ERR_ARGS
    fi
    rm -f "$probe"

    # Mandatory keystone credentials.
    if [ -z "${OCF_RESKEY_auth_url}" ]; then
        ocf_exit_reason "auth_url not configured"
        exit $OCF_ERR_CONFIGURED
    fi
    fence_options="${fence_options} -k ${OCF_RESKEY_auth_url}"

    if [ -z "${OCF_RESKEY_username}" ]; then
        ocf_exit_reason "username not configured"
        exit $OCF_ERR_CONFIGURED
    fi
    fence_options="${fence_options} -l ${OCF_RESKEY_username}"

    if [ -z "${OCF_RESKEY_password}" ]; then
        ocf_exit_reason "password not configured"
        exit $OCF_ERR_CONFIGURED
    fi
    # NOTE(review): the password ends up on the fence agent command line.
    fence_options="${fence_options} -p ${OCF_RESKEY_password}"

    if [ -z "${OCF_RESKEY_tenant_name}" ]; then
        ocf_exit_reason "tenant_name not configured"
        exit $OCF_ERR_CONFIGURED
    fi
    fence_options="${fence_options} -t ${OCF_RESKEY_tenant_name}"

    # Optional settings are appended only when they are configured.
    if [ -n "${OCF_RESKEY_user_domain}" ]; then
        fence_options="${fence_options} -u ${OCF_RESKEY_user_domain}"
    fi

    if [ -n "${OCF_RESKEY_project_domain}" ]; then
        fence_options="${fence_options} -P ${OCF_RESKEY_project_domain}"
    fi

    if [ -n "${OCF_RESKEY_domain}" ]; then
        fence_options="${fence_options} -d ${OCF_RESKEY_domain}"
    fi

    if [ -n "${OCF_RESKEY_region_name}" ]; then
        fence_options="${fence_options} \
            --region-name ${OCF_RESKEY_region_name}"
    fi

    # Boolean switches.
    if [ -n "${OCF_RESKEY_insecure}" ]; then
        if ocf_is_true "${OCF_RESKEY_insecure}"; then
            fence_options="${fence_options} --insecure"
        fi
    fi

    if [ -n "${OCF_RESKEY_no_shared_storage}" ]; then
        if ocf_is_true "${OCF_RESKEY_no_shared_storage}"; then
            fence_options="${fence_options} --no-shared-storage"
        fi
    fi

    if [ -n "${OCF_RESKEY_verbose}" ]; then
        if ocf_is_true "${OCF_RESKEY_verbose}"; then
            fence_options="${fence_options} --verbose"
        fi
    fi

    if [ -n "${OCF_RESKEY_endpoint_type}" ]; then
        case ${OCF_RESKEY_endpoint_type} in
            adminURL|publicURL|internalURL)
                ;;
            *)
                ocf_exit_reason "endpoint_type ${OCF_RESKEY_endpoint_type}" \
                    "not valid. Use adminURL or publicURL or internalURL"
                exit $OCF_ERR_CONFIGURED
                ;;
        esac
        fence_options="${fence_options} -e ${OCF_RESKEY_endpoint_type}"
    fi

    # Every failure above has already returned or exited.
    return $OCF_SUCCESS
}

# Marker file whose presence means that this resource instance is started.
statefile="${HA_RSCTMP}/${OCF_RESOURCE_INSTANCE}.active"

# Dispatch the requested action. Arms that do not exit by themselves fall
# through to the common logging/exit code after the case statement.
case $__OCF_ACTION in
    start)
        # Do not report a successful start for a configuration that failed
        # validation; the validation status becomes the action status.
        evacuate_validate && evacuate_start
        ;;
    stop)
        evacuate_stop
        ;;
    monitor)
        # evacuate_validate is needed here because it builds fence_options.
        # NOTE(review): its return status is not checked in this arm; that is
        # kept as-is because checking it would change probe results.
        evacuate_validate
        evacuate_monitor
        ;;
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    usage|help)
        evacuate_usage
        exit $OCF_SUCCESS
        ;;
    validate-all)
        # Actually validate instead of unconditionally reporting success.
        evacuate_validate
        ;;
    *)
        evacuate_usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc
