#
# ovirt-hosted-engine-ha -- ovirt hosted engine high availability
# Copyright (C) 2013 Red Hat, Inc.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#

import ConfigParser
import errno
import functools
import json
import logging
import os
import re
import socket
import subprocess
import time

import sanlock

from . import constants
from ..env import config
from ..env import path as env_path
from ..lib import brokerlink
from ..lib import exceptions as ex
from ..lib import log_filter
from ..lib import metadata
from ..lib import util
from ..lib import vds_client as vdsc
from .state_machine import EngineStateMachine


class MetadataTooNewError(Exception):
    """
    Raised when the metadata parser reports a metadata version that is
    too new for this agent to handle.
    """


def handler_cleanup(f):
    """
    Decorator: call a cleanup method when transitioning out of a state
    (i.e. when the handler returns a state other than its own).

    The decorated handler must return a sequence whose first element is
    the next state.  When that differs from self._rinfo['current-state'],
    the method named '<handler name>_cleanup' is looked up on the instance
    and called with no arguments.  The handler's return value is passed
    through unchanged.
    """
    cleanup_fn = f.__name__ + '_cleanup'

    # Preserve the handler's name and docstring on the wrapper so that
    # introspection and logging still show the real handler
    @functools.wraps(f)
    def cleanup_wrapper(self):
        ret = f(self)
        if ret[0] != self._rinfo['current-state']:
            getattr(self, cleanup_fn)()
        return ret
    return cleanup_wrapper


def float_or_none(v):
    """
    Convert v to a float, mapping the literal string "None" to None.
    """
    return None if v == "None" else float(v)


def engine_status(status):
    """
    Decode a json-serialized engine status into a dict of plain strings.

    The literal string 'None' yields None.  Input that cannot be
    decoded yields a placeholder dict flagging a serialization error.
    """
    if status == 'None':
        return None

    try:
        decoded = json.loads(status)
        # Convert the json unicode strings back to ascii:
        # it makes the output and logs much easier to read
        return dict((str(key), str(value))
                    for (key, value) in decoded.iteritems())
    except ValueError:
        return {"vm": "unknown", "health": "unknown",
                "detail": "serialization error"}


class HostedEngine(object):
    # Log-filter keys and their companion *_INT values.  Within this class
    # LF_MD_ERROR and LF_GLOBAL_MD_ERROR are handed to log_filter.lf_args()
    # together with the matching *_INT value when metadata errors are logged.
    # NOTE(review): the *_INT values look like intervals in seconds used to
    # rate-limit repeated messages -- confirm against log_filter.
    LF_MD_ERROR = 'LF_MD_ERROR'
    LF_MD_ERROR_INT = 900
    LF_ENGINE_HEALTH = 'LF_ENGINE_HEALTH'
    LF_ENGINE_HEALTH_INT = 60
    LF_GLOBAL_MD_ERROR = 'LF_GLOBAL_MD_ERROR'
    LF_GLOBAL_MD_ERROR_INT = 900
    LF_MAINTENANCE = 'LF_MAINTENANCE'
    LF_MAINTENANCE_INT = 900

    # Maps an engine status string to a numeric rank (higher values are
    # assigned to healthier statuses).  Not referenced by the methods of
    # this class that are visible here.
    engine_status_score_lookup = {
        'None': 0,
        'vm-down': 1,
        'vm-up bad-health-status': 2,
        'vm-up good-health-status': 3,
    }

    class States(object):
        """Symbolic names of the engine states."""
        ENTRY = 'ENTRY'
        OFF = 'OFF'
        START = 'START'
        ON = 'ON'
        STOP = 'STOP'
        MIGRATE_START = 'MIGRATE_START'
        MIGRATE_MONITOR = 'MIGRATE_MONITOR'
        MAINTENANCE = 'MAINTENANCE'

    class MigrationStatus(object):
        """Symbolic names for the phases of an engine VM migration."""
        STARTED = 'STARTED'
        IN_PROGRESS = 'IN_PROGRESS'
        DONE = 'DONE'
        FAILURE = 'FAILURE'

    class DomainMonitorStatus(object):
        """
        Values returned by _get_domain_monitor_status(): NONE when the
        storage domain is absent from the VDSM repoStats result, ACQUIRED
        when its 'acquired' entry is true, PENDING otherwise.
        """
        NONE = 'NONE'
        PENDING = 'PENDING'
        ACQUIRED = 'ACQUIRED'

    class MaintenanceMode(object):
        """Symbolic names of the maintenance modes."""
        NONE = 'NONE'
        GLOBAL = 'GLOBAL'
        LOCAL = 'LOCAL'

    def __init__(self, shutdown_requested_callback):
        """
        Initialize hosted engine monitoring logic.  shutdown_requested_callback
        is a callback returning True/False depending on whether ha agent
        shutdown has been requested.

        No connection to the broker, VDSM or sanlock is made here; that
        happens in start_monitoring().
        """
        self._log = logging.getLogger("%s.HostedEngine" % __name__)
        self._log.addFilter(log_filter.IntermittentFilter())

        self._shutdown_requested_callback = shutdown_requested_callback
        self._config = config.Config()

        self._score_cfg = self._get_score_config()
        self._hostname = self._get_hostname()

        self._broker = None
        self._required_monitors = self._get_required_monitors()
        self._local_monitors = {}
        self.fsm = EngineStateMachine(self, self._log, actions={
            "START_VM": self._start_engine_vm,
            "MIGRATE": self._start_migration,
            "MONITOR_MIGRATION": self._monitor_migration,
            "STOP_VM": self._stop_engine_vm
        })

        self._sd_path = None
        # Kept for backward compatibility; the rest of the class uses
        # _metadata_dir, which _initialize_sanlock() fills in
        self._metadata_path = None
        # Define the attribute up front so that reading it before
        # _initialize_sanlock() has run does not raise AttributeError
        self._metadata_dir = None

        self._sanlock_initialized = False

    @property
    def score_config(self):
        """Score configuration dict built by _get_score_config()."""
        cfg = self._score_cfg
        return cfg

    @property
    def min_memory_threshold(self):
        """The VM memory size option from the configuration, as an int."""
        mem_size = self._config.get(config.VM, config.MEM_SIZE)
        return int(mem_size)

    def _get_score_config(self):
        """
        Build the score configuration: defaults from the constants module,
        overridden by the 'score' section of the agent configuration file
        (if that section exists).

        Returns a dict whose values are ints, except for the cpu load
        penalty bounds, which are floats.
        """
        float_keys = frozenset(['cpu-load-penalty-min',
                                'cpu-load-penalty-max'])
        raw = {
            'base-score': constants.BASE_SCORE,
            'gateway-score-penalty': constants.GATEWAY_SCORE_PENALTY,
            'mgmt-bridge-score-penalty': constants.MGMT_BRIDGE_SCORE_PENALTY,
            'free-memory-score-penalty': constants.FREE_MEMORY_SCORE_PENALTY,
            'cpu-load-score-penalty': constants.CPU_LOAD_SCORE_PENALTY,
            'engine-retry-score-penalty': constants.ENGINE_RETRY_SCORE_PENALTY,
            'cpu-load-penalty-min': constants.CPU_LOAD_PENALTY_MIN,
            'cpu-load-penalty-max': constants.CPU_LOAD_PENALTY_MAX,
        }

        parser = ConfigParser.SafeConfigParser()
        parser.read(constants.AGENT_CONF_FILE)
        try:
            overrides = parser.items('score')
        except (ConfigParser.NoOptionError, ConfigParser.NoSectionError):
            # No overrides configured, the defaults stand
            overrides = []
        raw.update(overrides)

        # When these are used they're expected to be numeric types
        return dict((key, float(value) if key in float_keys else int(value))
                    for key, value in raw.iteritems())

    def _get_hostname(self):
        """
        Return the name this host should introduce itself as, which must
        match the Common Name in the certificate used by libvirt (usually
        the vdsm certificate).

        The subject of the VDSM certificate is read with openssl; when
        openssl fails or no common name is found, the result of
        socket.gethostname() is returned instead.
        """
        cmd = ['openssl', 'x509',
               '-in', constants.VDSM_CERT_FILE,
               '-noout', '-subject']
        self._log.debug("Executing: {0}".format(' '.join(cmd)))
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout, stderr = proc.communicate()

        if proc.returncode != 0:
            self._log.info("Certificate not available (%s),"
                           " using hostname to identify host", stderr)
            return socket.gethostname()

        self._log.debug("Certificate subject: %s", stdout)
        names = re.findall(r'/CN=([A-Za-z0-9-_\.]+)', stdout)
        if names and names[0]:
            common_name = names[0]
            self._log.info("Found certificate common name: %s", common_name)
            return common_name

        self._log.info("Certificate common name not found,"
                       " using hostname to identify host")
        return socket.gethostname()

    def _get_required_monitors(self):
        """
        Called by __init__(), see self._required_monitors

        Returns a list of dicts, one per submonitor to start.  For each
        entry:
         'field' - field name in the _local_monitors dict, holding details:
                    'id' - id of started submonitor (or None if not started)
                    'status' - last status returned by this monitor
         'monitor' - monitor type, e.g. ping or cpu-load
         'options' - dict of options needed by this monitor
         'type'    - optional function that converts the value from string
                     to some better type
        """
        cfg = self._config.get
        return [
            {
                'field': 'gateway',
                'monitor': 'ping',
                'type': bool,
                'options': {
                    'addr': cfg(config.ENGINE, config.GATEWAY_ADDR),
                },
            },
            {
                'field': 'bridge',
                'monitor': 'mgmt-bridge',
                'type': bool,
                'options': {
                    'address': '0',
                    'use_ssl': cfg(config.ENGINE, config.VDSM_SSL),
                    'bridge_name': cfg(config.ENGINE, config.BRIDGE_NAME),
                },
            },
            {
                'field': 'mem-free',
                'monitor': 'mem-free',
                'type': float_or_none,
                'options': {
                    'address': '0',
                    'use_ssl': cfg(config.ENGINE, config.VDSM_SSL),
                },
            },
            {
                'field': 'cpu-load',
                'monitor': 'cpu-load-no-engine',
                'type': float_or_none,
                'options': {
                    'address': '0',
                    'use_ssl': cfg(config.ENGINE, config.VDSM_SSL),
                    'vm_uuid': cfg(config.VM, config.VM_UUID),
                },
            },
            {
                'field': 'engine-health',
                'monitor': 'engine-health',
                'type': engine_status,
                'options': {
                    'address': '0',
                    'use_ssl': cfg(config.ENGINE, config.VDSM_SSL),
                    'vm_uuid': cfg(config.VM, config.VM_UUID),
                },
            },
        ]

    @property
    def host_id(self):
        """The host id option from the configuration, as an int."""
        raw_id = self._config.get(config.ENGINE, config.HOST_ID)
        return int(raw_id)

    def start_monitoring(self):
        """
        Run the main agent loop.

        After an initial round of initialization, every step yielded by
        the state machine is announced to the broker, the initialization
        is re-checked and the local state is published to storage.  The
        loop ends when shutdown is requested, when the state machine stops
        yielding, or after constants.MAX_ERROR_COUNT failed iterations in
        a row.  The broker connection is closed on the way out.
        """
        consecutive_errors = 0

        # make sure everything is initialized
        self._initialize_broker()
        self._initialize_vdsm()
        self._initialize_sanlock()
        self._initialize_domain_monitor()

        for old_state, state, delay in self.fsm:
            if self._shutdown_requested_callback():
                break

            self._log.debug("Processing engine state %s", state)
            transition = "%s-%s" % (old_state.__class__.__name__,
                                    state.__class__.__name__)
            self._broker.notify(brokerlink.NotifyEvents.STATE_TRANSITION,
                                transition,
                                hostname=socket.gethostname())

            try:
                # make sure everything is still initialized
                self._initialize_broker()
                self._initialize_vdsm()
                self._initialize_sanlock()
                self._initialize_domain_monitor()

                # log state
                self._log.info("Current state %s (score: %d)",
                               state.__class__.__name__,
                               state.score(self._log))
                best = state.data.best_score_host
                if best:
                    self._log.info("Best remote host %s (id: %d, score: %d)",
                                   best["hostname"],
                                   best["host-id"],
                                   best["score"])

                # publish the current state
                self._push_to_storage(self._generate_local_blocks(state))
            except Exception as e:
                self._log.warning("Error while monitoring engine: %s", str(e))
                expected = (ex.DisconnectionError, ex.RequestError)
                if not isinstance(e, expected):
                    self._log.warning("Unexpected error", exc_info=True)

                # Back off for at least a minute after a failure
                delay = max(delay, 60)
                consecutive_errors += 1
                log_level = logging.INFO
            else:
                # All is well, reset the error counter
                consecutive_errors = 0
                log_level = logging.DEBUG

            if consecutive_errors >= constants.MAX_ERROR_COUNT:
                self._log.error("Shutting down the agent because of "
                                "%d failures in a row!",
                                constants.MAX_ERROR_COUNT)
                break

            self._log.log(log_level, "Sleeping %d seconds", delay)
            time.sleep(delay)

        self._log.debug("Disconnecting from ha-broker")
        if self._broker and self._broker.is_connected():
            self._broker.disconnect()

    def _initialize_broker(self):
        """
        Connect to the ha-broker (unless already connected) and start all
        required submonitors, recording them in self._local_monitors.

        Connection errors are logged and re-raised.  If a submonitor fails
        to start, the broker is disconnected and dropped before the
        RequestError is re-raised.
        """
        if self._broker and self._broker.is_connected():
            return

        self._log.info("Initializing ha-broker connection")
        if not self._broker:
            self._broker = brokerlink.BrokerLink()
        try:
            self._broker.connect(constants.BROKER_CONNECTION_RETRIES)
        except Exception as e:
            self._log.error("Failed to connect to ha-broker: %s", str(e))
            raise

        for spec in self._required_monitors:
            try:
                monitor_id = self._broker.start_monitor(
                    spec['monitor'], spec.get('options', {}))
            except ex.RequestError:
                self._log.error("Failed to start necessary monitors")
                # Stopping monitors will occur automatically upon disconnection
                self._broker.disconnect()
                self._broker = None
                raise

            self._local_monitors[spec['field']] = {
                'id': monitor_id,
                'type': spec.get('type'),
            }
        self._log.info("Broker initialized, all submonitors started")

    def _initialize_vdsm(self):
        """
        Make sure vdsmd is running and the storage is connected, then
        refresh self._sd_path with the current storage domain path.

        `hosted-engine --connect-storage` is attempted up to
        constants.MAX_VDSM_WAIT_SECS times.  Raises Exception if none of
        the attempts succeeds.
        """
        # TODO not the most efficient means to maintain vdsmd...
        self._cond_start_service('vdsmd')

        self._log.debug("Verifying storage is attached")
        # `hosted-engine --connect-storage` internally calls vdsClient's
        # connectStorageServer command, which can be executed repeatedly
        # without issue even if the storage is already connected.  Note
        # that if vdsm was just started, it might take a few seconds to
        # initialize before accepting commands, thus the retries.
        cmd = [constants.HOSTED_ENGINE_BINARY, '--connect-storage']
        stderr = ''
        for attempt in range(1, constants.MAX_VDSM_WAIT_SECS + 1):
            self._log.debug("Executing {0}".format(cmd))
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            stdout, stderr = p.communicate()
            self._log.debug("Attempt %d, return code: %d",
                            attempt, p.returncode)
            self._log.debug("stdout: %s", stdout)
            self._log.debug("stderr: %s", stderr)
            if p.returncode == 0:
                self._log.debug("Successfully verified that VDSM"
                                " is attached to storage")
                break
        else:
            # Reached only when no attempt succeeded.  Comparing the
            # attempt counter with the limit after the loop would also
            # treat a success on the very last attempt as a failure.
            self._log.error("Failed trying to connect storage: %s", stderr)
            raise Exception("Failed trying to connect storage")

        # Update to the current mount path for the domain
        self._sd_path = env_path.get_domain_path(self._config)
        self._log.debug("Path to storage domain is %s", self._sd_path)

    def _cond_start_service(self, service_name):
        """
        Start the given system service through `sudo service` unless its
        status check already succeeds.

        Raises Exception carrying the stderr of the start command when
        the service cannot be started.
        """
        self._log.debug("Checking %s status", service_name)
        with open(os.devnull, "w") as devnull:
            status_cmd = ['sudo', 'service', service_name, 'status']
            checker = subprocess.Popen(status_cmd,
                                       stdout=devnull, stderr=devnull)
            if checker.wait() == 0:
                self._log.debug("%s running", service_name)
                return

            self._log.info("Starting %s", service_name)
            start_cmd = ['sudo', 'service', service_name, 'start']
            starter = subprocess.Popen(start_cmd,
                                       stdout=devnull,
                                       stderr=subprocess.PIPE)
            _, start_err = starter.communicate()
            if starter.returncode != 0:
                raise Exception("Could not start {0}: {1}"
                                .format(service_name, start_err))

    def _initialize_sanlock(self):
        """
        Make sure the sanlock service is running and that this host holds
        its host id lease in the hosted engine lockspace.

        Also refreshes self._metadata_dir.  Raises Exception when the
        lease cannot be acquired; an already-held lease counts as success.
        """
        self._cond_start_service('sanlock')

        self._metadata_dir = env_path.get_metadata_path(self._config)
        lease_file = os.path.join(self._metadata_dir,
                                  constants.SERVICE_TYPE + '.lockspace')
        # Announce at INFO only the first time around
        lvl = logging.DEBUG if self._sanlock_initialized else logging.INFO
        self._log.log(lvl, "Ensuring lease for lockspace %s, host id %d"
                           " is acquired (file: %s)",
                           constants.LOCKSPACE_NAME, self.host_id, lease_file)

        # Specific explanations for the sanlock errors we know about
        known_failures = {
            errno.EINVAL: ("cannot get lock on host id {0}:"
                           " host already holds lock on a different host id"),
            errno.EINTR: ("cannot get lock on host id {0}:"
                          " sanlock operation interrupted (will retry)"),
            errno.EINPROGRESS: ("cannot get lock on host id {0}:"
                                " sanlock operation in progress (will retry)"),
        }

        try:
            sanlock.add_lockspace(constants.LOCKSPACE_NAME,
                                  self.host_id, lease_file)
        except sanlock.SanlockException as e:
            err = getattr(e, 'errno', None)
            if err == errno.EEXIST:
                self._log.debug("Host already holds lock")
            else:
                template = known_failures.get(err)
                if template is not None:
                    msg = template.format(self.host_id)
                else:
                    msg = ("cannot get lock on host id {0}: {1}"
                           .format(self.host_id, str(e)))
                self._log.error(msg, exc_info=True)
                raise Exception("Failed to initialize sanlock: {0}"
                                .format(msg))
        else:
            self._log.info("Acquired lock on host id %d", self.host_id)
        self._sanlock_initialized = True

    def _initialize_domain_monitor(self):
        """
        Make sure VDSM is monitoring the hosted engine storage domain.

        Monitoring is started if VDSM does not report the domain at all,
        then the status is polled every 5 seconds until the domain is
        acquired or constants.MAX_DOMAIN_MONITOR_WAIT_SECS is exceeded.

        Raises Exception if monitoring cannot be started or the domain is
        not acquired in time.
        """
        use_ssl = util.to_bool(self._config.get(config.ENGINE,
                                                config.VDSM_SSL))
        sd_uuid = self._config.get(config.ENGINE, config.SD_UUID)
        host_id = self.host_id

        status = self._get_domain_monitor_status()
        if status == self.DomainMonitorStatus.NONE:
            try:
                vdsc.run_vds_client_cmd(
                    '0',
                    use_ssl,
                    'startMonitoringDomain',
                    sd_uuid,
                    host_id,
                )
            except Exception as e:
                msg = ("Failed to start monitoring domain"
                       " (sd_uuid={0}, host_id={1}): {2}"
                       .format(sd_uuid, host_id, str(e)))
                self._log.error(msg, exc_info=True)
                raise Exception(msg)
            else:
                self._log.info("Started VDSM domain monitor for %s", sd_uuid)
                status = self._get_domain_monitor_status()

        waited = 0
        while status != self.DomainMonitorStatus.ACQUIRED \
                and waited <= constants.MAX_DOMAIN_MONITOR_WAIT_SECS:
            waited += 5
            time.sleep(5)
            status = self._get_domain_monitor_status()

        if status == self.DomainMonitorStatus.ACQUIRED:
            self._log.debug("VDSM is monitoring domain %s", sd_uuid)
            return

        msg = ("Failed to start monitoring domain"
               " (sd_uuid={0}, host_id={1}): {2}"
               .format(sd_uuid, host_id,
                       "timeout during domain acquisition"))
        # No exception is being handled here, so exc_info is not passed:
        # it would only append a meaningless "None" traceback to the log
        self._log.error(msg)
        raise Exception(msg)

    def _get_domain_monitor_status(self):
        """
        Ask VDSM (repoStats) for the monitoring state of the hosted engine
        storage domain and return one of the DomainMonitorStatus values.

        Raises Exception if the VDSM call fails.
        """
        use_ssl = util.to_bool(self._config.get(config.ENGINE,
                                                config.VDSM_SSL))
        sd_uuid = self._config.get(config.ENGINE, config.SD_UUID)

        try:
            repo_stats = vdsc.run_vds_client_cmd('0', use_ssl, 'repoStats')
        except Exception as e:
            msg = ("Failed to get VDSM domain monitor status: {0}"
                   .format(str(e)))
            self._log.error(msg, exc_info=True)
            raise Exception(msg)

        # Anything short of an acquired domain is reported at INFO
        log_level = logging.INFO
        if sd_uuid not in repo_stats:
            status = self.DomainMonitorStatus.NONE
        elif repo_stats[sd_uuid]['acquired']:
            status = self.DomainMonitorStatus.ACQUIRED
            log_level = logging.DEBUG
        else:
            status = self.DomainMonitorStatus.PENDING

        self._log.log(log_level, "VDSM domain monitor status: %s", status)
        return status

    def _generate_local_blocks(self, state):
        """
        Build the metadata this host publishes on shared storage.

        The first block holds the machine readable state in the
        following format:

          {md_parse_vers}|{md_feature_vers}|{ts_int}
            |{host_id}|{score}|{engine_status}|{name}|{maintenance}

        The compiled block is read back from the storage by other hosts,
        parsed from the string above, and used in the state machine logic.
        Most importantly to determine where the engine should be running.

        It is followed by one or more blocks of human readable key=value
        lines.  Every block is padded with NUL characters to
        constants.METADATA_BLOCK_BYTES.

        Returns the concatenated blocks as a single string.  Raises
        Exception if the machine readable part does not fit in one block.
        """
        block_size = constants.METADATA_BLOCK_BYTES
        score = state.score(self.fsm.logger)
        lm = state.data.stats.local
        md = state.metadata()
        data = ("{md_parse_vers}|{md_feature_vers}|{ts_int}"
                "|{host_id}|{score}|{engine_status}|{name}|{maintenance}"
                .format(md_parse_vers=constants.METADATA_PARSE_VERSION,
                        md_feature_vers=constants.METADATA_FEATURE_VERSION,
                        ts_int=state.data.stats.collect_start,
                        host_id=state.data.stats.host_id,
                        score=score,
                        engine_status=lm['engine-health'],
                        name=self._hostname,
                        maintenance=1 if md["maintenance"] else 0))
        if len(data) > block_size:
            # Report the size, as the message promises, not the payload
            raise Exception("Output metadata too long ({0} bytes)"
                            .format(len(data)))

        info_parts = [
            "metadata_parse_version={md_parse_vers}\n"
            "metadata_feature_version={md_feature_vers}\n"
            "timestamp={ts_int} ({ts_str})\n"
            "host-id={host_id}\n"
            "score={score}\n"
            .format(md_parse_vers=constants.METADATA_PARSE_VERSION,
                    md_feature_vers=constants.METADATA_FEATURE_VERSION,
                    ts_int=state.data.stats.collect_start,
                    ts_str=time.ctime(state.data.stats.collect_start),
                    host_id=state.data.host_id,
                    score=score)
        ]
        info_parts.extend("{0}={1}\n".format(k, str(v))
                          for (k, v) in sorted(md.iteritems()))
        info = "".join(info_parts)

        # Number of blocks needed for the readable part, rounded up.
        # Floor division keeps this an exact integer computation.
        info_count = (len(info) + block_size - 1) // block_size
        self._log.debug("Generated %d blocks:\n%s\n<\\0 padding>\n%s",
                        info_count + 1, data, info)
        data = data.ljust(block_size, '\0')
        info = info.ljust(block_size * info_count, '\0')
        return data + info

    def _push_to_storage(self, blocks):
        """
        Hand the given metadata blocks to the broker, which stores them
        under this host's id in the metadata directory.
        """
        configured_host_id = self._config.get(config.ENGINE, config.HOST_ID)
        self._broker.put_stats_on_storage(self._metadata_dir,
                                          constants.SERVICE_TYPE,
                                          configured_host_id,
                                          blocks)

    def collect_stats(self):
        """
        Gather everything the state machine needs for one iteration: the
        metadata all hosts published on storage and the current values
        of the local submonitors.

        Returns a dict with the keys "metadata_too_new", "cluster",
        "host_id", "hosts", "local" and "maintenance", as described
        inline below.
        """
        data = {
            # Flag is set if the local agent discovers metadata too new for it
            # to parse, in which case the agent will shut down the engine VM.
            "metadata_too_new": False,

            # Global metadata
            "cluster": {},

            # Id of this host just to make sure
            "host_id": self.host_id,

            # Metadata for remote hosts
            "hosts": {},

            # Local data
            "local": {},

            # Maintenance information
            "maintenance": False,
        }

        # Read the storage once; tolerate an empty/None answer so that
        # the iteration below cannot fail on it
        all_stats = self._broker.get_stats_from_storage(
            self._metadata_dir,
            constants.SERVICE_TYPE) or {}

        # host_id 0 is a special case, representing global metadata
        if 0 in all_stats:
            data["cluster"] = self.process_global_metadata(all_stats.pop(0))

        # collect the last reported state for all hosts
        local_host_id = self.host_id
        for host_id, remote_data in all_stats.iteritems():
            # we are not interested in stale data about local
            # machine
            if host_id == local_host_id:
                continue
            try:
                stats = self.process_remote_metadata(host_id, remote_data)
            except MetadataTooNewError:
                data["metadata_too_new"] = True
            else:
                data["hosts"][host_id] = stats

        # collect all local stats
        self._log.debug("Refreshing all submonitors")
        for field, monitor in self._local_monitors.iteritems():
            ret = self._broker.get_monitor_status(monitor['id'])
            if monitor['type'] is not None:
                # Convert the raw status with the monitor's type function
                ret = monitor['type'](ret)
            data["local"][field] = ret

        # check local maintenance
        data["local"]["maintenance"] = util.to_bool(self._config.get(
            config.HA,
            config.LOCAL_MAINTENANCE))

        self._log.debug("Refresh complete")

        return data

    def process_remote_metadata(self, host_id, data):
        """
        Parse the metadata block published by a remote host.

        Returns the parsed metadata dict with 'engine-status' decoded by
        engine_status(), or an empty dict if the block cannot be parsed or
        does not belong to host_id.  Raises MetadataTooNewError when the
        parser reports a fatal metadata error.
        """
        def log_md_error(message):
            # Metadata errors are logged with the per-host log filter key
            self._log.error(
                message,
                extra=log_filter.lf_args(self.LF_MD_ERROR + str(host_id),
                                         self.LF_MD_ERROR_INT))

        try:
            md = metadata.parse_metadata_to_dict(host_id, data)
        except ex.FatalMetadataError as e:
            log_md_error(str(e))
            raise MetadataTooNewError()
        except ex.MetadataError as e:
            log_md_error(str(e))
            return {}
        except AssertionError as e:
            # Keep ignoring the host if the parser itself asserts
            log_md_error(str(e))
            return {}

        # Make sure the Id database is consistent.  This is an explicit
        # check rather than an assert so that it survives `python -O` and
        # produces a meaningful log message.
        if md["host-id"] != host_id:
            log_md_error("Inconsistent host id in metadata:"
                         " expected {0}, found {1}"
                         .format(host_id, md["host-id"]))
            return {}

        md['engine-status'] = engine_status(md["engine-status"])
        return md

    def process_global_metadata(self, data):
        """
        Parse the global metadata block into a dict.

        Returns an empty dict when there is no data or when the data
        cannot be parsed (the parse error is logged).
        """
        if data is None:
            return {}

        try:
            return metadata.parse_global_metadata_to_dict(self._log, data)
        except ex.MetadataError as e:
            self._log.error(
                str(e),
                extra=log_filter.lf_args(self.LF_GLOBAL_MD_ERROR,
                                         self.LF_GLOBAL_MD_ERROR_INT))
            # Continue agent processing, ignoring the bad global metadata
            return {}

    def _start_migration(self, host_id, hostname):
        """
        Ask VDSM to start an online migration of the engine VM from this
        host to the given destination.

        Returns True if the request was accepted, False if it failed (the
        failure is logged).
        """
        vm_id = self._config.get(config.VM, config.VM_UUID)
        use_ssl = util.to_bool(self._config.get(config.ENGINE,
                                                config.VDSM_SSL))
        migrate_args = {
            'vmId': vm_id,
            'method': 'online',
            'src': 'localhost',
            'dst': hostname,
        }
        try:
            self._log.debug("Initiating online migration of"
                            " vm %s from localhost to %s",
                            vm_id, hostname)
            vdsc.run_vds_client_cmd('0', use_ssl, 'migrate', **migrate_args)
        except Exception:
            self._log.error("Migration to host %s (id %d) failed to start",
                            hostname,
                            host_id,
                            exc_info=True)
            return False
        return True

    def _monitor_migration(self):
        """
        Query VDSM for the migration status of the engine VM.

        Returns the status message reported by VDSM, or False if the
        query failed (the failure is logged).
        """
        vm_id = self._config.get(config.VM, config.VM_UUID)
        use_ssl = util.to_bool(self._config.get(config.ENGINE,
                                                config.VDSM_SSL))
        try:
            self._log.debug("Monitoring migration of vm %s", vm_id)
            res = vdsc.run_vds_client_cmd('0', use_ssl,
                                          'migrateStatus', vm_id)
        except Exception:
            # Log the exception and report the failure to the caller
            self._log.error("Failed to migrate", exc_info=True)
            return False

        return res["status"]["message"]

    def _start_engine_vm(self):
        """
        Start the engine VM on this host with `hosted-engine --vm-start`,
        after clearing any stale VDSM state for it.

        Returns True if the VM was started and False on any failure (the
        failure is logged).  Returns None when the start command fails
        because the VM already exists according to VDSM.
        """
        try:
            # Ensure there isn't any stale VDSM state from a prior VM lifecycle
            self._clean_vdsm_state()

            self._log.info("Starting vm using `%s --vm-start`",
                           constants.HOSTED_ENGINE_BINARY)
            p = subprocess.Popen([constants.HOSTED_ENGINE_BINARY,
                                  '--vm-start'],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            stdout, stderr = p.communicate()
            self._log.info("stdout: %s", stdout)
            self._log.info("stderr: %s", stderr)
            if p.returncode != 0:
                # FIXME consider removing, we can get vm status from sanlock,
                # if still an issue then the alternative tracking the time we
                # started the engine might be better than parsing this output
                if stdout.startswith("Virtual machine already exists"):
                    self._log.warning("Failed to start engine VM,"
                                      " already running according to VDSM")
                    return

                self._log.error("Failed: %s", stderr)
                raise Exception(stderr)

            # A successful start is not an error, so log it at INFO
            self._log.info("Engine VM started on localhost")
            return True
        except Exception as e:
            self._log.error("Failed to start engine VM: %s", str(e))
            return False

    def _clean_vdsm_state(self):
        """
        Query VDSM for stats on hosted engine VM, and if there are stats for
        the VM but the VM is not running, attempt to clear them using the
        VDSM 'destroy' verb.  If after 10 tries the state is present, raise
        an exception indicating the error.
        """
        self._log.info("Ensuring VDSM state is clear for engine VM")
        vm_id = self._config.get(config.VM, config.VM_UUID)
        use_ssl = util.to_bool(self._config.get(config.ENGINE,
                                                config.VDSM_SSL))
        no_such_vm = "Virtual machine does not exist"
        running_states = ('powering up', 'up')

        # Loop until state is clear or until timeout
        for _ in range(10):
            try:
                stats = vdsc.run_vds_client_cmd('0', use_ssl,
                                                'getVmStats', vm_id)
            except ex.DetailedError as e:
                if e.detail != no_such_vm:
                    raise
                self._log.info("Vdsm state for VM clean")
                return

            vm_status = stats['statsList'][0]['status'].lower()
            if vm_status in running_states:
                self._log.info("VM is running on host")
                return

            self._log.info("Cleaning state for non-running VM")
            try:
                vdsc.run_vds_client_cmd('0', use_ssl, 'destroy', vm_id)
            except ex.DetailedError as e:
                if e.detail != no_such_vm:
                    raise
                self._log.info("Vdsm state for VM clean")
                return
            time.sleep(1)

        raise Exception("Timed out trying to clean VDSM state for VM")

    def _stop_engine_vm(self, force=False):
        """
        Stop the engine VM on this host using the hosted-engine binary:
        `--vm-shutdown` by default, `--vm-poweroff` when force is true.

        A VM that does not exist counts as stopped.  Returns True on
        success and False on failure (the failure is logged).
        """
        try:
            cmd = '--vm-poweroff' if force else '--vm-shutdown'
            self._log.info("Shutting down vm using `%s %s`",
                           constants.HOSTED_ENGINE_BINARY, cmd)
            p = subprocess.Popen([constants.HOSTED_ENGINE_BINARY, cmd],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            stdout, stderr = p.communicate()
            self._log.info("stdout: %s", stdout)
            self._log.info("stderr: %s", stderr)
            already_gone = stdout.startswith("Virtual machine does not exist")
            if p.returncode != 0 and not already_gone:
                self._log.error("Failed to stop engine vm with %s %s: %s",
                                constants.HOSTED_ENGINE_BINARY, cmd, stderr)
                raise Exception(stderr)

            # A successful stop is not an error, so log it at INFO
            self._log.info("Engine VM stopped on localhost")
            return True
        except Exception as e:
            self._log.error("Failed to stop engine VM: %s", str(e))
            return False
