#!/usr/libexec/platform-python -tt

#
# Requires the googleapiclient and oauth2client
# RHEL 7.x: google-api-python-client==1.6.7 python-gflags==2.0 pyasn1==0.4.8 rsa==3.4.2 pysocks==1.7.1 httplib2==0.19.0
# RHEL 8.x: pysocks==1.7.1 httplib2==0.19.0
# SLES 12.x: python-google-api-python-client python-oauth2client python-oauth2client-gce pysocks==1.7.1 httplib2==0.19.0
# SLES 15.x: python3-google-api-python-client python3-oauth2client pysocks==1.7.1 httplib2==0.19.0
#

import atexit
import logging
import json
import re
import os
import socket
import sys
import time

if sys.version_info >= (3, 0):
  # Python 3 imports.
  import urllib.parse as urlparse
  import urllib.request as urlrequest
else:
  # Python 2 imports.
  import urllib as urlparse
  import urllib2 as urlrequest
sys.path.append("/usr/share/fence")

from fencing import fail_usage, run_delay, all_opt, atexit_handler, check_input, process_input, show_docs, fence_action, run_command
try:
  sys.path.insert(0, '/usr/lib/fence-agents/bundled/google')
  import httplib2
  import googleapiclient.discovery
  import socks
  try:
    from google.oauth2.credentials import Credentials as GoogleCredentials
  except:
    from oauth2client.client import GoogleCredentials
except:
  pass

METADATA_SERVER = 'http://metadata.google.internal/computeMetadata/v1/'
METADATA_HEADERS = {'Metadata-Flavor': 'Google'}
INSTANCE_LINK = 'https://www.googleapis.com/compute/v1/projects/{}/zones/{}/instances/{}'

def run_on_fail(options):
	if "--runonfail" in options:
		run_command(options, options["--runonfail"])

def fail_fence_agent(options, message):
	run_on_fail(options)
	fail_usage(message)

def raise_fence_agent(options, message):
	run_on_fail(options)
	raise Exception(message)

#
# Will use baremetalsolution setting or the environment variable
# FENCE_GCE_URI_REPLACEMENTS to replace the uri for calls to *.googleapis.com.
#
def replace_api_uri(options, http_request):
	uri_replacements = []
	# put any env var replacements first, then baremetalsolution if in options
	if "FENCE_GCE_URI_REPLACEMENTS" in os.environ:
		logging.debug("FENCE_GCE_URI_REPLACEMENTS environment variable exists")
		env_uri_replacements = os.environ["FENCE_GCE_URI_REPLACEMENTS"]
		try:
			uri_replacements_json = json.loads(env_uri_replacements)
			if isinstance(uri_replacements_json, list):
				uri_replacements = uri_replacements_json
			else:
				logging.warning("FENCE_GCE_URI_REPLACEMENTS exists, but is not a JSON List")
		except ValueError as e:
			logging.warning("FENCE_GCE_URI_REPLACEMENTS exists but is not valid JSON")
	if "--baremetalsolution" in options:
		uri_replacements.append(
			{
				"matchlength": 4,
				"match": "https://compute.googleapis.com/compute/v1/projects/(.*)/zones/(.*)/instances/(.*)/reset(.*)",
				"replace": "https://baremetalsolution.googleapis.com/v1/projects/\\1/locations/\\2/instances/\\3:resetInstance\\4"
			})
	for uri_replacement in uri_replacements:
		# each uri_replacement should have matchlength, match, and replace
		if "matchlength" not in uri_replacement or "match" not in uri_replacement or "replace" not in uri_replacement:
			logging.warning("FENCE_GCE_URI_REPLACEMENTS missing matchlength, match, or replace in %s" % uri_replacement)
			continue
		match = re.match(uri_replacement["match"], http_request.uri)
		if match is None or len(match.groups()) != uri_replacement["matchlength"]:
			continue
		replaced_uri = re.sub(uri_replacement["match"], uri_replacement["replace"], http_request.uri)
		match = re.match("https:\/\/.*.googleapis.com", replaced_uri)
		if match is None or match.start() != 0:
			logging.warning("FENCE_GCE_URI_REPLACEMENTS replace is not "
				"targeting googleapis.com, ignoring it: %s" % replaced_uri)
			continue
		logging.debug("Replacing googleapis uri %s with %s" % (http_request.uri, replaced_uri))
		http_request.uri = replaced_uri
		break
	return http_request

def retry_api_execute(options, http_request):
	replaced_http_request = replace_api_uri(options, http_request)
	retries = 3
	if options.get("--retries"):
		retries = int(options.get("--retries"))
	retry_sleep = 5
	if options.get("--retrysleep"):
		retry_sleep = int(options.get("--retrysleep"))
	retry = 0
	current_err = None
	while retry <= retries:
		if retry > 0:
			time.sleep(retry_sleep)
		try:
			return replaced_http_request.execute()
		except Exception as err:
			current_err = err
			logging.warning("Could not execute api call to: %s, retry: %s, "
				"err: %s" % (replaced_http_request.uri, retry, str(err)))
		retry += 1
	raise current_err


def translate_status(instance_status):
	"Returns on | off | unknown."
	if instance_status == "RUNNING":
		return "on"
	elif instance_status == "TERMINATED":
		return "off"
	return "unknown"


def get_nodes_list(conn, options):
	result = {}
	if "--zone" not in options:
		fail_fence_agent(options, "Failed: get_nodes_list: Please specify the --zone in the command")
	try:
		for zone in options["--zone"].split(","):
			instanceList = retry_api_execute(options, conn.instances().list(
				project=options["--project"],
				zone=zone))
			for instance in instanceList["items"]:
				result[instance["id"]] = (instance["name"], translate_status(instance["status"]))
	except Exception as err:
		fail_fence_agent(options, "Failed: get_nodes_list: {}".format(str(err)))

	return result


def get_power_status(conn, options):
	logging.debug("get_power_status")
	# if this is bare metal we need to just send back the opposite of the
	# requested action: if on send off, if off send on
	if "--baremetalsolution" in options:
		if options.get("--action") == "on":
			return "off"
		else:
			return "on"
	# If zone is not listed for an entry we attempt to get it automatically
	instance = options["--plug"]
	zone = get_zone(conn, options, instance) if "--plugzonemap" not in options else options["--plugzonemap"][instance]
	instance_status = get_instance_power_status(conn, options, instance, zone)
	# If any of the instances do not match the intended status we return the
	# the opposite status so that the fence agent can change it.
	if instance_status != options.get("--action"):
		return instance_status

	return options.get("--action")


def get_instance_power_status(conn, options, instance, zone):
	try:
		instance = retry_api_execute(
				options,
				conn.instances().get(project=options["--project"], zone=zone, instance=instance))
		return translate_status(instance["status"])
	except Exception as err:
		fail_fence_agent(options, "Failed: get_instance_power_status: {}".format(str(err)))


def check_for_existing_operation(conn, options, instance, zone, operation_type):
	logging.debug("check_for_existing_operation")
	if "--baremetalsolution" in options:
		# There is no API for checking in progress operations
		return False

	project = options["--project"]
	target_link = INSTANCE_LINK.format(project, zone, instance)
	query_filter = '(targetLink = "{}") AND (operationType = "{}") AND (status = "RUNNING")'.format(target_link, operation_type)
	result = retry_api_execute(
			options,
			conn.zoneOperations().list(project=project, zone=zone, filter=query_filter, maxResults=1))

	if "items" in result and result["items"]:
		logging.info("Existing %s operation found", operation_type)
		return result["items"][0]


def wait_for_operation(conn, options, zone, operation):
	if 'name' not in operation:
		logging.warning('Cannot wait for operation to complete, the'
		' requested operation will continue asynchronously')
		return False

	wait_time = 0
	project = options["--project"]
	while True:
		result = retry_api_execute(options, conn.zoneOperations().get(
			project=project,
			zone=zone,
			operation=operation['name']))
		if result['status'] == 'DONE':
			if 'error' in result:
				raise_fence_agent(options, result['error'])
			return True

		if "--errortimeout" in options and wait_time > int(options["--errortimeout"]):
			raise_fence_agent(options, "Operation did not complete before the timeout.")

		if "--warntimeout" in options and wait_time > int(options["--warntimeout"]):
			logging.warning("Operation did not complete before the timeout.")
			if "--runonwarn" in options:
				run_command(options, options["--runonwarn"])
			return False

		wait_time = wait_time + 1
		time.sleep(1)


def set_power_status(conn, options):
	logging.debug("set_power_status")
	instance = options["--plug"]
	# If zone is not listed for an entry we attempt to get it automatically
	zone = get_zone(conn, options, instance) if "--plugzonemap" not in options else options["--plugzonemap"][instance]
	set_instance_power_status(conn, options, instance, zone, options["--action"])


def set_instance_power_status(conn, options, instance, zone, action):
	logging.info("Setting power status of %s in zone %s", instance, zone)
	project = options["--project"]

	try:
		if action == "off":
			logging.info("Issuing poweroff of %s in zone %s", instance, zone)
			operation = check_for_existing_operation(conn, options, instance, zone, "stop")
			if operation and "--earlyexit" in options:
				return
			if not operation:
				operation = retry_api_execute(
						options,
						conn.instances().stop(project=project, zone=zone, instance=instance))
			logging.info("Poweroff command completed, waiting for the operation to complete")
			if wait_for_operation(conn, options, zone, operation):
				logging.info("Poweroff of %s in zone %s complete", instance, zone)
		elif action == "on":
			logging.info("Issuing poweron of %s in zone %s", instance, zone)
			operation = check_for_existing_operation(conn, options, instance, zone, "start")
			if operation and "--earlyexit" in options:
				return
			if not operation:
				operation = retry_api_execute(
						options,
						conn.instances().start(project=project, zone=zone, instance=instance))
			if wait_for_operation(conn, options, zone, operation):
				logging.info("Poweron of %s in zone %s complete", instance, zone)
	except Exception as err:
		fail_fence_agent(options, "Failed: set_instance_power_status: {}".format(str(err)))

def power_cycle(conn, options):
	logging.debug("power_cycle")
	instance = options["--plug"]
	# If zone is not listed for an entry we attempt to get it automatically
	zone = get_zone(conn, options, instance) if "--plugzonemap" not in options else options["--plugzonemap"][instance]
	return power_cycle_instance(conn, options, instance, zone)


def power_cycle_instance(conn, options, instance, zone):
	logging.info("Issuing reset of %s in zone %s", instance, zone)
	project = options["--project"]

	try:
		operation = check_for_existing_operation(conn, options, instance, zone, "reset")
		if operation and "--earlyexit" in options:
			return True
		if not operation:
			operation = retry_api_execute(
					options,
					conn.instances().reset(project=project, zone=zone, instance=instance))
		logging.info("Reset command sent, waiting for the operation to complete")
		if wait_for_operation(conn, options, zone, operation):
			logging.info("Reset of %s in zone %s complete", instance, zone)
		return True
	except Exception as err:
		logging.exception("Failed: power_cycle")
		raise err


def get_zone(conn, options, instance):
	logging.debug("get_zone");
	project = options['--project']
	fl = 'name="%s"' % instance
	request = replace_api_uri(options, conn.instances().aggregatedList(project=project, filter=fl))
	while request is not None:
		response = request.execute()
		zones = response.get('items', {})
		for zone in zones.values():
			for inst in zone.get('instances', []):
				if inst['name'] == instance:
					return inst['zone'].split("/")[-1]
		request = replace_api_uri(options, conn.instances().aggregatedList_next(
				previous_request=request, previous_response=response))
	raise_fence_agent(options, "Unable to find instance %s" % (instance))


def get_metadata(metadata_key, params=None, timeout=None):
	"""Performs a GET request with the metadata headers.

	Args:
		metadata_key: string, the metadata to perform a GET request on.
		params: dictionary, the query parameters in the GET request.
		timeout: int, timeout in seconds for metadata requests.

	Returns:
		HTTP response from the GET request.

	Raises:
		urlerror.HTTPError: raises when the GET request fails.
	"""
	logging.debug("get_metadata");
	timeout = timeout or 60
	metadata_url = os.path.join(METADATA_SERVER, metadata_key)
	params = urlparse.urlencode(params or {})
	url = '%s?%s' % (metadata_url, params)
	request = urlrequest.Request(url, headers=METADATA_HEADERS)
	request_opener = urlrequest.build_opener(urlrequest.ProxyHandler({}))
	return request_opener.open(request, timeout=timeout * 1.1).read().decode("utf-8")


def define_new_opts():
	all_opt["zone"] = {
		"getopt" : ":",
		"longopt" : "zone",
		"help" : "--zone=[name]                  Zone, e.g. us-central1-b",
		"shortdesc" : "Zone.",
		"required" : "0",
		"order" : 2
	}
	all_opt["project"] = {
		"getopt" : ":",
		"longopt" : "project",
		"help" : "--project=[name]               Project ID",
		"shortdesc" : "Project ID.",
		"required" : "0",
		"order" : 3
	}
	all_opt["stackdriver-logging"] = {
		"getopt" : "",
		"longopt" : "stackdriver-logging",
		"help" : "--stackdriver-logging          Enable Logging to Stackdriver",
		"shortdesc" : "Stackdriver-logging support.",
		"longdesc" : "If enabled IP failover logs will be posted to stackdriver logging.",
		"required" : "0",
		"order" : 4
	}
	all_opt["baremetalsolution"] = {
		"getopt" : "",
		"longopt" : "baremetalsolution",
		"help" : "--baremetalsolution            Enable on bare metal",
		"shortdesc" : "If enabled this is a bare metal offering from google.",
		"required" : "0",
		"order" : 5
	}
	all_opt["apitimeout"] = {
		"getopt" : ":",
		"type" : "second",
		"longopt" : "apitimeout",
		"help" : "--apitimeout=[seconds]         Timeout to use for API calls",
		"shortdesc" : "Timeout in seconds to use for API calls, default is 60.",
		"required" : "0",
		"default" : 60,
		"order" : 6
	}
	all_opt["retries"] = {
		"getopt" : ":",
		"type" : "integer",
		"longopt" : "retries",
		"help" : "--retries=[retries]            Number of retries on failure for API calls",
		"shortdesc" : "Number of retries on failure for API calls, default is 3.",
		"required" : "0",
		"default" : 3,
		"order" : 7
	}
	all_opt["retrysleep"] = {
		"getopt" : ":",
		"type" : "second",
		"longopt" : "retrysleep",
		"help" : "--retrysleep=[seconds]         Time to sleep between API retries",
		"shortdesc" : "Time to sleep in seconds between API retries, default is 5.",
		"required" : "0",
		"default" : 5,
		"order" : 8
	}
	all_opt["serviceaccount"] = {
		"getopt" : ":",
		"longopt" : "serviceaccount",
		"help" : "--serviceaccount=[filename]    Service account json file location e.g. serviceaccount=/somedir/service_account.json",
		"shortdesc" : "Service Account to use for authentication to the google cloud APIs.",
		"required" : "0",
		"order" : 9
	}
	all_opt["plugzonemap"] = {
		"getopt" : ":",
		"longopt" : "plugzonemap",
		"help" : "--plugzonemap=[plugzonemap]    Comma separated zone map when fencing multiple plugs",
		"shortdesc" : "Comma separated zone map when fencing multiple plugs.",
		"required" : "0",
		"order" : 10
	}
	all_opt["proxyhost"] = {
		"getopt" : ":",
		"longopt" : "proxyhost",
		"help" : "--proxyhost=[proxy_host]       The proxy host to use, if one is needed to access the internet (Example: 10.122.0.33)",
		"shortdesc" : "If a proxy is used for internet access, the proxy host should be specified.",
		"required" : "0",
		"order" : 11
	}
	all_opt["proxyport"] = {
		"getopt" : ":",
		"type" : "integer",
		"longopt" : "proxyport",
		"help" : "--proxyport=[proxy_port]       The proxy port to use, if one is needed to access the internet (Example: 3127)",
		"shortdesc" : "If a proxy is used for internet access, the proxy port should be specified.",
		"required" : "0",
		"order" : 12
	}
	all_opt["earlyexit"] = {
		"getopt" : "",
		"longopt" : "earlyexit",
		"help" : "--earlyexit                    Return early if reset is already in progress",
		"shortdesc" : "If an existing reset operation is detected, the fence agent will return before the operation completes with a 0 return code.",
		"required" : "0",
		"order" : 13
	}
	all_opt["warntimeout"] = {
		"getopt" : ":",
		"type" : "second",
		"longopt" : "warntimeout",
		"help" : "--warntimeout=[warn_timeout]   Timeout seconds before logging a warning and returning a 0 status code",
		"shortdesc" : "If the operation is not completed within the timeout, the cluster operations are allowed to continue.",
		"required" : "0",
		"order" : 14
	}
	all_opt["errortimeout"] = {
		"getopt" : ":",
		"type" : "second",
		"longopt" : "errortimeout",
		"help" : "--errortimeout=[error_timeout] Timeout seconds before failing and returning a non-zero status code",
		"shortdesc" : "If the operation is not completed within the timeout, cluster is notified of the operation failure.",
		"required" : "0",
		"order" : 15
	}
	all_opt["runonwarn"] = {
		"getopt" : ":",
		"longopt" : "runonwarn",
		"help" : "--runonwarn=[run_on_warn]      If a timeout occurs and warning is generated, run the supplied command",
		"shortdesc" : "If a timeout would occur while running the agent, then the supplied command is run.",
		"required" : "0",
		"order" : 16
	}
	all_opt["runonfail"] = {
		"getopt" : ":",
		"longopt" : "runonfail",
		"help" : "--runonfail=[run_on_fail]      If a failure occurs, run the supplied command",
		"shortdesc" : "If a failure would occur while running the agent, then the supplied command is run.",
		"required" : "0",
		"order" : 17
	}


def main():
	conn = None

	device_opt = ["port", "no_password", "zone", "project", "stackdriver-logging",
		"method", "baremetalsolution", "apitimeout", "retries", "retrysleep",
		"serviceaccount", "plugzonemap", "proxyhost", "proxyport", "earlyexit",
		"warntimeout", "errortimeout", "runonwarn", "runonfail"]

	atexit.register(atexit_handler)

	define_new_opts()

	all_opt["power_timeout"]["default"] = "60"
	all_opt["method"]["default"] = "cycle"
	all_opt["method"]["help"] = "-m, --method=[method]          Method to fence (onoff|cycle) (Default: cycle)"

	options = check_input(device_opt, process_input(device_opt))

	docs = {}
	docs["shortdesc"] = "Fence agent for GCE (Google Cloud Engine)"
	docs["longdesc"] = "fence_gce is a Power Fencing agent for GCE (Google Cloud " \
			   "Engine). It uses the googleapiclient library to connect to GCE.\n" \
			   "googleapiclient can be configured with Google SDK CLI or by " \
			   "executing 'gcloud auth application-default login'.\n" \
			   "For instructions see: https://cloud.google.com/compute/docs/tutorials/python-guide"
	docs["vendorurl"] = "http://cloud.google.com"
	show_docs(options, docs)

	run_delay(options)

	# Prepare logging
	if options.get('--verbose') is None:
		logging.getLogger('googleapiclient').setLevel(logging.ERROR)
		logging.getLogger('oauth2client').setLevel(logging.ERROR)
	if options.get('--stackdriver-logging') is not None and options.get('--plug'):
		try:
			import google.cloud.logging.handlers
			client = google.cloud.logging.Client()
			handler = google.cloud.logging.handlers.CloudLoggingHandler(client, name=options['--plug'])
			handler.setLevel(logging.INFO)
			formatter = logging.Formatter('gcp:stonith "%(message)s"')
			handler.setFormatter(formatter)
			root_logger = logging.getLogger()
			if options.get('--verbose') is None:
				root_logger.setLevel(logging.INFO)
			root_logger.addHandler(handler)
		except ImportError:
			logging.error('Couldn\'t import google.cloud.logging, '
				'disabling Stackdriver-logging support')

  # if apitimeout is defined we set the socket timeout, if not we keep the
  # socket default which is 60s
	if options.get("--apitimeout"):
		socket.setdefaulttimeout(options["--apitimeout"])

	# Prepare cli
	try:
		serviceaccount = options.get("--serviceaccount")
		if serviceaccount:
			scope = ['https://www.googleapis.com/auth/cloud-platform']
			logging.debug("using credentials from service account")
			try:
				from google.oauth2.service_account import Credentials as ServiceAccountCredentials
				credentials = ServiceAccountCredentials.from_service_account_file(filename=serviceaccount, scopes=scope)
			except ImportError:
				from oauth2client.service_account import ServiceAccountCredentials
				credentials = ServiceAccountCredentials.from_json_keyfile_name(serviceaccount, scope)
		else:
			try:
				from googleapiclient import _auth
				credentials = _auth.default_credentials();
			except:
				credentials = GoogleCredentials.get_application_default()
			logging.debug("using application default credentials")

		if options.get("--proxyhost") and options.get("--proxyport"):
			proxy_info = httplib2.ProxyInfo(
				proxy_type=socks.PROXY_TYPE_HTTP,
				proxy_host=options.get("--proxyhost"),
				proxy_port=int(options.get("--proxyport")))
			http = credentials.authorize(httplib2.Http(proxy_info=proxy_info))
			conn = googleapiclient.discovery.build(
				'compute', 'v1', http=http, cache_discovery=False)
		else:
			conn = googleapiclient.discovery.build(
				'compute', 'v1', credentials=credentials, cache_discovery=False)
	except Exception as err:
		fail_fence_agent(options, "Failed: Create GCE compute v1 connection: {}".format(str(err)))

	# Get project and zone
	if not options.get("--project"):
		try:
			options["--project"] = get_metadata('project/project-id')
		except Exception as err:
			fail_fence_agent(options, "Failed retrieving GCE project. Please provide --project option: {}".format(str(err)))

	if "--baremetalsolution" in options:
		options["--zone"] = "none"

	# Populates zone automatically if missing from the command
	zones = [] if not "--zone" in options else options["--zone"].split(",")
	options["--plugzonemap"] = {}
	if "--plug" in options:
		for i, instance in enumerate(options["--plug"].split(",")):
			if len(zones) == 1:
				# If only one zone is specified, use it across all plugs
				options["--plugzonemap"][instance] = zones[0]
				continue

			if len(zones) - 1 >= i:
				# If we have enough zones specified with the --zone flag use the zone at
				# the same index as the plug
				options["--plugzonemap"][instance] = zones[i]
				continue

			try:
				# In this case we do not have a zone specified so we attempt to detect it
				options["--plugzonemap"][instance] = get_zone(conn, options, instance)
			except Exception as err:
				fail_fence_agent(options, "Failed retrieving GCE zone. Please provide --zone option: {}".format(str(err)))

	# Operate the fencing device
	result = fence_action(conn, options, set_power_status, get_power_status, get_nodes_list, power_cycle)
	sys.exit(result)

if __name__ == "__main__":
	main()
