Skip to content

Instantly share code, notes, and snippets.

@VictorRodriguez
Created September 20, 2019 15:52
Show Gist options
  • Save VictorRodriguez/e137a8cd87cf821f8076e9acc02ce195 to your computer and use it in GitHub Desktop.
Save VictorRodriguez/e137a8cd87cf821f8076e9acc02ce195 to your computer and use it in GitHub Desktop.
Detect crashed VMs
From 38af2845cc01fa9cab54f3cf66c750d3612e7afc Mon Sep 17 00:00:00 2001
From: Mario Alfredo Carrillo Arevalo <mario.alfredo.c.arevalo@intel.com>
Date: Fri, 16 Aug 2019 19:16:13 +0000
Subject: [PATCH] Detect crashed VMs
This patch is a port of another patch originally shared by Wind River.
It was ported for:
- Repo: https://github.com/starlingx-staging/stx-nova.git
- Branch: stx/stein.2
The objective of this patch is to detect when a VM has crashed.
Signed-off-by: Mario Alfredo Carrillo Arevalo <mario.alfredo.c.arevalo@intel.com>
---
nova/compute/manager.py | 73 ++++++++++++++++++++++-------
nova/notifications/base.py | 10 ++++
nova/tests/unit/compute/test_compute_mgr.py | 38 +++++++++++++--
nova/tests/unit/test_notifications.py | 11 ++++-
nova/tests/unit/virt/libvirt/fakelibvirt.py | 16 +++++++
nova/tests/unit/virt/libvirt/test_host.py | 45 ++++++++++++++++++
nova/virt/event.py | 11 ++++-
nova/virt/libvirt/host.py | 12 +++++
8 files changed, 194 insertions(+), 22 deletions(-)
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 8421808..ac9d8e2 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -1104,8 +1104,16 @@ class ComputeManager(manager.Manager):
vm_power_state = power_state.RUNNING
elif event_transition == virtevent.EVENT_LIFECYCLE_SUSPENDED:
vm_power_state = power_state.SUSPENDED
+ # WRS: add handling of crashed event
+ elif event.get_transition() == virtevent.EVENT_LIFECYCLE_CRASHED:
+ vm_power_state = power_state.CRASHED
else:
LOG.warning("Unexpected lifecycle event: %d", event_transition)
+ @utils.synchronized(instance.uuid)
+ def sync_instance_power_state():
+ self._sync_instance_power_state(context,
+ instance,
+ vm_power_state)
migrate_finish_statuses = {
# This happens on the source node and indicates live migration
@@ -1126,7 +1134,12 @@ class ComputeManager(manager.Manager):
# Note(lpetrut): The event may be delayed, thus not reflecting
# the current instance power state. In that case, ignore the event.
current_power_state = self._get_power_state(context, instance)
- if current_power_state == vm_power_state:
+ if (current_power_state == vm_power_state or
+ # WRS: We map hypervisor crashes to the CRASHED power state,
+ # but libvirt will put the guest in the SHUTDOWN state. See
+ # nova.virt.libvirt.host.Host._event_lifecycle_callback...
+ (vm_power_state == power_state.CRASHED and
+ current_power_state == power_state.SHUTDOWN)):
LOG.debug('Synchronizing instance power state after lifecycle '
'event "%(event)s"; current vm_state: %(vm_state)s, '
'current task_state: %(task_state)s, current DB '
@@ -1138,9 +1151,7 @@ class ComputeManager(manager.Manager):
'db_power_state': instance.power_state,
'vm_power_state': vm_power_state},
instance_uuid=instance.uuid)
- self._sync_instance_power_state(context,
- instance,
- vm_power_state)
+ sync_instance_power_state()
# The following checks are for live migration. We want to activate
# the port binding for the destination host before the live migration
@@ -2838,6 +2849,15 @@ class ComputeManager(manager.Manager):
@utils.synchronized(instance.uuid)
def do_stop_instance():
current_power_state = self._get_power_state(context, instance)
+ # WRS: Get the latest DB info to minimize the race window.
+ # If the instance has crashed, skip stopping and let VIM recovery
+ # code take over.
+ instance.refresh()
+ if instance.power_state == power_state.CRASHED:
+ LOG.info('Skip stopping instance since instance crashed.',
+ instance_uuid=instance.uuid)
+ return
+
LOG.debug('Stopping instance; current vm_state: %(vm_state)s, '
'current task_state: %(task_state)s, current DB '
'power_state: %(db_power_state)s, current VM '
@@ -2883,6 +2903,13 @@ class ComputeManager(manager.Manager):
do_stop_instance()
+ # WRS: add handling of instance crash
+ def _request_recovery(self, context, instance):
+ """Instance crashed notification for recovery request."""
+ instance.save(expected_task_state=[None])
+ self._instance_update(context, instance,
+ power_state=power_state.CRASHED)
+
def _power_on(self, context, instance):
network_info = self.network_api.get_instance_nw_info(context, instance)
block_device_info = self._get_instance_block_device_info(context,
@@ -7963,6 +7990,9 @@ class ComputeManager(manager.Manager):
db_instance.refresh(use_slave=use_slave)
db_power_state = db_instance.power_state
vm_state = db_instance.vm_state
+ # WRS: skip periodic sync while a VM crashed event is being handled
+ if db_power_state == power_state.CRASHED:
+ return
if self.host != db_instance.host:
# on the sending end of nova-compute _sync_power_state
@@ -7980,16 +8010,22 @@ class ComputeManager(manager.Manager):
instance=db_instance)
return
elif db_instance.task_state is not None:
- # on the receiving end of nova-compute, it could happen
- # that the DB instance already report the new resident
- # but the actual VM has not showed up on the hypervisor
- # yet. In this case, let's allow the loop to continue
- # and run the state sync in a later round
- LOG.info("During sync_power_state the instance has a "
- "pending task (%(task)s). Skip.",
- {'task': db_instance.task_state},
- instance=db_instance)
- return
+ # WRS: if instance crashed reset task_state to allow recovery
+ if vm_state == vm_states.ACTIVE \
+ and vm_power_state == power_state.CRASHED:
+ db_instance.task_state = None
+ db_instance.save()
+ else:
+ # on the receiving end of nova-compute, it could happen
+ # that the DB instance already reports the new resident
+ # but the actual VM has not shown up on the hypervisor
+ # yet. In this case, let's allow the loop to continue
+ # and run the state sync in a later round
+ LOG.info("During sync_power_state the instance has a "
+ "pending task (%(task)s). Skip.",
+ {'task': db_instance.task_state},
+ instance=db_instance)
+ return
orig_db_power_state = db_power_state
if vm_power_state != db_power_state:
@@ -8017,10 +8053,15 @@ class ComputeManager(manager.Manager):
pass
elif vm_state == vm_states.ACTIVE:
# The only rational power state should be RUNNING
- if vm_power_state in (power_state.SHUTDOWN,
- power_state.CRASHED):
+ # WRS: CRASHED is handled separately
+ if vm_power_state == power_state.SHUTDOWN:
self._stop_unexpected_shutdown_instance(
context, vm_state, db_instance, orig_db_power_state)
+ # WRS: add handling of crashed state
+ elif vm_power_state == power_state.CRASHED:
+ LOG.warning("Instance crashed. Let VIM recover it.(%s)",
+ db_instance.uuid)
+ self._request_recovery(context, db_instance)
elif vm_power_state == power_state.SUSPENDED:
LOG.warning("Instance is suspended unexpectedly. Calling "
"the stop API.", instance=db_instance)
diff --git a/nova/notifications/base.py b/nova/notifications/base.py
index 1a6f0ef..b29f7e3 100644
--- a/nova/notifications/base.py
+++ b/nova/notifications/base.py
@@ -13,6 +13,13 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
+#
+# Copyright (c) 2013-2017 Wind River Systems, Inc.
+#
+# The right to copy, distribute, modify, or otherwise make use
+# of this software may be licensed only pursuant to the terms
+# of an applicable Wind River license agreement.
+#
"""Functionality related to notifications common to multiple layers of
the system.
@@ -158,11 +165,14 @@ def _compute_states_payload(instance, old_vm_state=None,
if old_task_state is None:
old_task_state = instance["task_state"]
+ # WRS: add instance power_state to states_payload
+ power_state = instance["power_state"]
states_payload = {
"old_state": old_vm_state,
"state": new_vm_state,
"old_task_state": old_task_state,
"new_task_state": new_task_state,
+ "power_state": power_state,
}
return states_payload
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 3355962..937d08b 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -9,6 +9,13 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
+#
+# Copyright (c) 2013-2017 Wind River Systems, Inc.
+#
+# The right to copy, distribute, modify, or otherwise make use
+# of this software may be licensed only pursuant to the terms
+# of an applicable Wind River license agreement.
+#
"""Unit tests for ComputeManager()."""
@@ -151,6 +158,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
virtevent.EVENT_LIFECYCLE_RESUMED: power_state.RUNNING,
virtevent.EVENT_LIFECYCLE_SUSPENDED:
power_state.SUSPENDED,
+ # WRS: add crashed event
+ virtevent.EVENT_LIFECYCLE_CRASHED: power_state.CRASHED,
virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED:
power_state.PAUSED,
virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED:
@@ -874,7 +883,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
mock_host_get.assert_called_once_with(self.context, our_host,
expected_attrs=['info_cache', 'metadata'])
mock_init_virt.assert_called_once_with()
- mock_temp_mut.assert_called_once_with(self.context, read_deleted='yes')
+ mock_temp_mut.assert_called_with(self.context, read_deleted='yes')
mock_get_inst.assert_called_once_with(self.context)
mock_get_net.assert_called_once_with(self.context, deleted_instance)
@@ -1850,8 +1859,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
self.assertTrue(mock_save.called)
def test_sync_instance_power_state_to_stop(self):
- for ps in (power_state.SHUTDOWN, power_state.CRASHED,
- power_state.SUSPENDED):
+ # WRS: don't include crashed state - see added testcase
+ for ps in (power_state.SHUTDOWN, power_state.SUSPENDED):
self._test_sync_to_stop(power_state.RUNNING, vm_states.ACTIVE, ps)
for ps in (power_state.SHUTDOWN, power_state.CRASHED):
@@ -1861,6 +1870,25 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
self._test_sync_to_stop(power_state.SHUTDOWN, vm_states.STOPPED,
power_state.RUNNING, force=True)
+ # WRS: add testcase for crashed scenario. Detect call to _request_recovery
+ def test_sync_instance_power_state_crashed(self):
+ driver_power_state = power_state.CRASHED
+ instance = self._get_sync_instance(
+ power_state.RUNNING, vm_states.ACTIVE)
+
+ self.mox.StubOutWithMock(objects.Instance, 'refresh')
+ self.mox.StubOutWithMock(objects.Instance, 'save')
+ self.mox.StubOutWithMock(self.compute, '_request_recovery')
+
+ instance.refresh(use_slave=False)
+ instance.save()
+ self.compute._request_recovery(self.context, instance)
+ self.mox.ReplayAll()
+ self.compute._sync_instance_power_state(self.context, instance,
+ driver_power_state)
+ self.mox.VerifyAll()
+ self.mox.UnsetStubs()
+
def test_sync_instance_power_state_to_terminate(self):
self._test_sync_to_stop(power_state.RUNNING, vm_states.ACTIVE,
power_state.SHUTDOWN,
@@ -4279,6 +4307,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
self.context, vm_state=vm_states.ACTIVE,
task_state=None, power_state=power_state.SHUTDOWN)
+ @mock.patch.object(instance, 'refresh')
@mock.patch.object(self.compute, '_get_power_state',
return_value=power_state.SHUTDOWN)
@mock.patch.object(compute_utils, 'notify_about_instance_action')
@@ -4286,11 +4315,12 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
@mock.patch.object(self.compute, '_power_off_instance')
@mock.patch.object(instance, 'save')
def do_test(save_mock, power_off_mock, notify_mock,
- notify_action_mock, get_state_mock):
+ notify_action_mock, get_state_mock, refresh_mock):
# run the code
self.compute.stop_instance(self.context, instance, True)
# assert the calls
self.assertEqual(2, get_state_mock.call_count)
+ refresh_mock.assert_called_once_with()
notify_mock.assert_has_calls([
mock.call(self.context, instance, 'power_off.start'),
mock.call(self.context, instance, 'power_off.end')
diff --git a/nova/tests/unit/test_notifications.py b/nova/tests/unit/test_notifications.py
index aa058ad..68064d6 100644
--- a/nova/tests/unit/test_notifications.py
+++ b/nova/tests/unit/test_notifications.py
@@ -12,6 +12,13 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
+#
+# Copyright (c) 2013-2017 Wind River Systems, Inc.
+#
+# The right to copy, distribute, modify, or otherwise make use
+# of this software may be licensed only pursuant to the terms
+# of an applicable Wind River license agreement.
+#
"""Tests for common notifications."""
@@ -536,8 +543,10 @@ class NotificationsTestCase(test.TestCase):
class NotificationsFormatTestCase(test.NoDBTestCase):
def test_state_computation(self):
+ # WRS: add power_state
instance = {'vm_state': mock.sentinel.vm_state,
- 'task_state': mock.sentinel.task_state}
+ 'task_state': mock.sentinel.task_state,
+ 'power_state': None}
states = notifications._compute_states_payload(instance)
self.assertEqual(mock.sentinel.vm_state, states['state'])
self.assertEqual(mock.sentinel.vm_state, states['old_state'])
diff --git a/nova/tests/unit/virt/libvirt/fakelibvirt.py b/nova/tests/unit/virt/libvirt/fakelibvirt.py
index 03a70fc..aaf35aa 100644
--- a/nova/tests/unit/virt/libvirt/fakelibvirt.py
+++ b/nova/tests/unit/virt/libvirt/fakelibvirt.py
@@ -11,6 +11,13 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
+#
+# Copyright (c) 2013-2017 Wind River Systems, Inc.
+#
+# The right to copy, distribute, modify, or otherwise make use
+# of this software may be licensed only pursuant to the terms
+# of an applicable Wind River license agreement.
+#
import sys
import textwrap
@@ -109,6 +116,15 @@ VIR_NODE_CPU_STATS_ALL_CPUS = -1
VIR_DOMAIN_START_PAUSED = 1
+# WRS: add virDomainEventStoppedDetailType enum definitions
+VIR_DOMAIN_EVENT_STOPPED_SHUTDOWN = 0
+VIR_DOMAIN_EVENT_STOPPED_DESTROYED = 1
+VIR_DOMAIN_EVENT_STOPPED_CRASHED = 2
+VIR_DOMAIN_EVENT_STOPPED_MIGRATED = 3
+VIR_DOMAIN_EVENT_STOPPED_SAVED = 4
+VIR_DOMAIN_EVENT_STOPPED_FAILED = 5
+VIR_DOMAIN_EVENT_STOPPED_FROM_SNAPSHOT = 6
+
# libvirtError enums
# (Intentionally different from what's in libvirt. We do this to check,
# that consumers of the library are using the symbolic names rather than
diff --git a/nova/tests/unit/virt/libvirt/test_host.py b/nova/tests/unit/virt/libvirt/test_host.py
index e4f7bf1..e375aba 100644
--- a/nova/tests/unit/virt/libvirt/test_host.py
+++ b/nova/tests/unit/virt/libvirt/test_host.py
@@ -13,6 +13,13 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
+#
+# Copyright (c) 2016-2017 Wind River Systems, Inc.
+#
+# The right to copy, distribute, modify, or otherwise make use
+# of this software may be licensed only pursuant to the terms
+# of an applicable Wind River license agreement.
+#
import eventlet
from eventlet import greenthread
@@ -192,6 +199,44 @@ class HostTestCase(test.NoDBTestCase):
self.assertEqual(got_events[0].transition,
event.EVENT_LIFECYCLE_STOPPED)
+ # WRS: add testcase for crashed event
+ def test_event_lifecycle_crashed(self):
+ got_events = []
+
+ def handler(event):
+ got_events.append(event)
+
+ hostimpl = host.Host("qemu:///system",
+ lifecycle_event_handler=handler)
+
+ conn = hostimpl.get_connection()
+ hostimpl._init_events_pipe()
+
+ fake_dom_xml = """
+ <domain type='kvm'>
+ <uuid>cef19ce0-0ca2-11df-855d-b19fbce37686</uuid>
+ <devices>
+ <disk type='file'>
+ <source file='filename'/>
+ </disk>
+ </devices>
+ </domain>
+ """
+ dom = fakelibvirt.Domain(conn,
+ fake_dom_xml,
+ False)
+ hostimpl._event_lifecycle_callback(
+ conn, dom, fakelibvirt.VIR_DOMAIN_EVENT_STOPPED,
+ fakelibvirt.VIR_DOMAIN_EVENT_STOPPED_FAILED, hostimpl)
+ hostimpl._dispatch_events()
+
+ self.assertEqual(len(got_events), 1)
+ self.assertIsInstance(got_events[0], event.LifecycleEvent)
+ self.assertEqual(got_events[0].uuid,
+ "cef19ce0-0ca2-11df-855d-b19fbce37686")
+ self.assertEqual(got_events[0].transition,
+ event.EVENT_LIFECYCLE_CRASHED)
+
def test_event_lifecycle_callback_suspended_old_libvirt(self):
"""Tests the suspended lifecycle event with libvirt before post-copy
"""
diff --git a/nova/virt/event.py b/nova/virt/event.py
index 6e4e01e..6af0b9b 100644
--- a/nova/virt/event.py
+++ b/nova/virt/event.py
@@ -11,6 +11,13 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
+#
+# Copyright (c) 2013-2017 Wind River Systems, Inc.
+#
+# The right to copy, distribute, modify, or otherwise make use
+# of this software may be licensed only pursuant to the terms
+# of an applicable Wind River license agreement.
+#
"""
Asynchronous event notifications from virtualization drivers.
@@ -31,7 +38,8 @@ EVENT_LIFECYCLE_RESUMED = 3
EVENT_LIFECYCLE_SUSPENDED = 4
EVENT_LIFECYCLE_POSTCOPY_STARTED = 5
EVENT_LIFECYCLE_MIGRATION_COMPLETED = 6
-
+# WRS: add crashed event
+EVENT_LIFECYCLE_CRASHED = 7
NAMES = {
EVENT_LIFECYCLE_STARTED: _('Started'),
@@ -41,6 +49,7 @@ NAMES = {
EVENT_LIFECYCLE_SUSPENDED: _('Suspended'),
EVENT_LIFECYCLE_POSTCOPY_STARTED: _('Postcopy started'),
EVENT_LIFECYCLE_MIGRATION_COMPLETED: _('Migration completed'),
+ EVENT_LIFECYCLE_CRASHED: _('Crashed'),
}
diff --git a/nova/virt/libvirt/host.py b/nova/virt/libvirt/host.py
index baf6434..8f87252 100644
--- a/nova/virt/libvirt/host.py
+++ b/nova/virt/libvirt/host.py
@@ -17,6 +17,13 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
+#
+# Copyright (c) 2013-2017 Wind River Systems, Inc.
+#
+# The right to copy, distribute, modify, or otherwise make use
+# of this software may be licensed only pursuant to the terms
+# of an applicable Wind River license agreement.
+#
"""
Manages information about the host OS and hypervisor.
@@ -169,6 +176,11 @@ class Host(object):
transition = None
if event == libvirt.VIR_DOMAIN_EVENT_STOPPED:
transition = virtevent.EVENT_LIFECYCLE_STOPPED
+ # WRS: treat a stop caused by emulator/mgmt failure as a crash
+ if detail == libvirt.VIR_DOMAIN_EVENT_STOPPED_FAILED:
+ transition = virtevent.EVENT_LIFECYCLE_CRASHED
+ else:
+ transition = virtevent.EVENT_LIFECYCLE_STOPPED
elif event == libvirt.VIR_DOMAIN_EVENT_STARTED:
transition = virtevent.EVENT_LIFECYCLE_STARTED
elif event == libvirt.VIR_DOMAIN_EVENT_SUSPENDED:
--
2.7.4
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment