Created
September 20, 2019 15:52
-
-
Save VictorRodriguez/e137a8cd87cf821f8076e9acc02ce195 to your computer and use it in GitHub Desktop.
Detect crashed VMs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 38af2845cc01fa9cab54f3cf66c750d3612e7afc Mon Sep 17 00:00:00 2001 | |
From: Mario Alfredo Carrillo Arevalo <mario.alfredo.c.arevalo@intel.com> | |
Date: Fri, 16 Aug 2019 19:16:13 +0000 | |
Subject: [PATCH] Detect crashed VMs | |
This patch is a port of another patch shared by Wind River. | |
It was ported for: | |
- Repo: https://github.com/starlingx-staging/stx-nova.git | |
- Branch: stx/stein.2 | |
The objective of this patch is to detect when a VM has crashed. | |
Signed-off-by: Mario Alfredo Carrillo Arevalo <mario.alfredo.c.arevalo@intel.com> | |
--- | |
nova/compute/manager.py | 73 ++++++++++++++++++++++------- | |
nova/notifications/base.py | 10 ++++ | |
nova/tests/unit/compute/test_compute_mgr.py | 38 +++++++++++++-- | |
nova/tests/unit/test_notifications.py | 11 ++++- | |
nova/tests/unit/virt/libvirt/fakelibvirt.py | 16 +++++++ | |
nova/tests/unit/virt/libvirt/test_host.py | 45 ++++++++++++++++++ | |
nova/virt/event.py | 11 ++++- | |
nova/virt/libvirt/host.py | 12 +++++ | |
8 files changed, 194 insertions(+), 22 deletions(-) | |
diff --git a/nova/compute/manager.py b/nova/compute/manager.py | |
index 8421808..ac9d8e2 100644 | |
--- a/nova/compute/manager.py | |
+++ b/nova/compute/manager.py | |
@@ -1104,8 +1104,16 @@ class ComputeManager(manager.Manager): | |
vm_power_state = power_state.RUNNING | |
elif event_transition == virtevent.EVENT_LIFECYCLE_SUSPENDED: | |
vm_power_state = power_state.SUSPENDED | |
+ # WRS: add handling of crashed event | |
+ elif event.get_transition() == virtevent.EVENT_LIFECYCLE_CRASHED: | |
+ vm_power_state = power_state.CRASHED | |
else: | |
LOG.warning("Unexpected lifecycle event: %d", event_transition) | |
+ @utils.synchronized(instance.uuid) | |
+ def sync_instance_power_state(): | |
+ self._sync_instance_power_state(context, | |
+ instance, | |
+ vm_power_state) | |
migrate_finish_statuses = { | |
# This happens on the source node and indicates live migration | |
@@ -1126,7 +1134,12 @@ class ComputeManager(manager.Manager): | |
# Note(lpetrut): The event may be delayed, thus not reflecting | |
# the current instance power state. In that case, ignore the event. | |
current_power_state = self._get_power_state(context, instance) | |
- if current_power_state == vm_power_state: | |
+ if (current_power_state == vm_power_state or | |
+ # WRS: We map hypervisor crashes to the CRASHED power state, | |
+ # but libvirt will put the guest in the SHUTDOWN state. See | |
+ # nova.virt.libvirt.host.Host._event_lifecycle_callback... | |
+ (vm_power_state == power_state.CRASHED and | |
+ current_power_state == power_state.SHUTDOWN)): | |
LOG.debug('Synchronizing instance power state after lifecycle ' | |
'event "%(event)s"; current vm_state: %(vm_state)s, ' | |
'current task_state: %(task_state)s, current DB ' | |
@@ -1138,9 +1151,7 @@ class ComputeManager(manager.Manager): | |
'db_power_state': instance.power_state, | |
'vm_power_state': vm_power_state}, | |
instance_uuid=instance.uuid) | |
- self._sync_instance_power_state(context, | |
- instance, | |
- vm_power_state) | |
+ sync_instance_power_state() | |
# The following checks are for live migration. We want to activate | |
# the port binding for the destination host before the live migration | |
@@ -2838,6 +2849,15 @@ class ComputeManager(manager.Manager): | |
@utils.synchronized(instance.uuid) | |
def do_stop_instance(): | |
current_power_state = self._get_power_state(context, instance) | |
+ # WRS: Get the latest DB info to minimize race condition. | |
+ # If the instance is crashed, skip stopping and let VIM recovery | |
+ # code take over. | |
+ instance.refresh() | |
+ if instance.power_state == power_state.CRASHED: | |
+ LOG.info('Skip stopping instance since instance crashed.', | |
+ instance_uuid=instance.uuid) | |
+ return | |
+ | |
LOG.debug('Stopping instance; current vm_state: %(vm_state)s, ' | |
'current task_state: %(task_state)s, current DB ' | |
'power_state: %(db_power_state)s, current VM ' | |
@@ -2883,6 +2903,13 @@ class ComputeManager(manager.Manager): | |
do_stop_instance() | |
+ # WRS: add handling of instance crash | |
+ def _request_recovery(self, context, instance): | |
+ """Instance crashed notification for recovery request.""" | |
+ instance.save(expected_task_state=[None]) | |
+ self._instance_update(context, instance, | |
+ power_state=power_state.CRASHED) | |
+ | |
def _power_on(self, context, instance): | |
network_info = self.network_api.get_instance_nw_info(context, instance) | |
block_device_info = self._get_instance_block_device_info(context, | |
@@ -7963,6 +7990,9 @@ class ComputeManager(manager.Manager): | |
db_instance.refresh(use_slave=use_slave) | |
db_power_state = db_instance.power_state | |
vm_state = db_instance.vm_state | |
+ # WRS: skip periodic sync when VM Crashed event is being handled | |
+ if db_power_state == power_state.CRASHED: | |
+ return | |
if self.host != db_instance.host: | |
# on the sending end of nova-compute _sync_power_state | |
@@ -7980,16 +8010,22 @@ class ComputeManager(manager.Manager): | |
instance=db_instance) | |
return | |
elif db_instance.task_state is not None: | |
- # on the receiving end of nova-compute, it could happen | |
- # that the DB instance already report the new resident | |
- # but the actual VM has not showed up on the hypervisor | |
- # yet. In this case, let's allow the loop to continue | |
- # and run the state sync in a later round | |
- LOG.info("During sync_power_state the instance has a " | |
- "pending task (%(task)s). Skip.", | |
- {'task': db_instance.task_state}, | |
- instance=db_instance) | |
- return | |
+ # WRS: if instance crashed reset task_state to allow recovery | |
+ if vm_state == vm_states.ACTIVE \ | |
+ and vm_power_state == power_state.CRASHED: | |
+ db_instance.task_state = None | |
+ db_instance.save() | |
+ else: | |
+ # on the receiving end of nova-compute, it could happen | |
+ # that the DB instance already report the new resident | |
+ # but the actual VM has not showed up on the hypervisor | |
+ # yet. In this case, let's allow the loop to continue | |
+ # and run the state sync in a later round | |
+ LOG.info("During sync_power_state the instance has a " | |
+ "pending task (%(task)s). Skip.", | |
+ {'task': db_instance.task_state}, | |
+ instance=db_instance) | |
+ return | |
orig_db_power_state = db_power_state | |
if vm_power_state != db_power_state: | |
@@ -8017,10 +8053,15 @@ class ComputeManager(manager.Manager): | |
pass | |
elif vm_state == vm_states.ACTIVE: | |
# The only rational power state should be RUNNING | |
- if vm_power_state in (power_state.SHUTDOWN, | |
- power_state.CRASHED): | |
+ # WRS: CRASHED is handled separately | |
+ if vm_power_state == power_state.SHUTDOWN: | |
self._stop_unexpected_shutdown_instance( | |
context, vm_state, db_instance, orig_db_power_state) | |
+ # WRS: add handling of crashed state | |
+ elif vm_power_state == power_state.CRASHED: | |
+ LOG.warning("Instance crashed. Let VIM recover it.(%s)", | |
+ db_instance.uuid) | |
+ self._request_recovery(context, db_instance) | |
elif vm_power_state == power_state.SUSPENDED: | |
LOG.warning("Instance is suspended unexpectedly. Calling " | |
"the stop API.", instance=db_instance) | |
diff --git a/nova/notifications/base.py b/nova/notifications/base.py | |
index 1a6f0ef..b29f7e3 100644 | |
--- a/nova/notifications/base.py | |
+++ b/nova/notifications/base.py | |
@@ -13,6 +13,13 @@ | |
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | |
# License for the specific language governing permissions and limitations | |
# under the License. | |
+# | |
+# Copyright (c) 2013-2017 Wind River Systems, Inc. | |
+# | |
+# The right to copy, distribute, modify, or otherwise make use | |
+# of this software may be licensed only pursuant to the terms | |
+# of an applicable Wind River license agreement. | |
+# | |
"""Functionality related to notifications common to multiple layers of | |
the system. | |
@@ -158,11 +165,14 @@ def _compute_states_payload(instance, old_vm_state=None, | |
if old_task_state is None: | |
old_task_state = instance["task_state"] | |
+ # WRS: add instance power_state to states_payload | |
+ power_state = instance["power_state"] | |
states_payload = { | |
"old_state": old_vm_state, | |
"state": new_vm_state, | |
"old_task_state": old_task_state, | |
"new_task_state": new_task_state, | |
+ "power_state": power_state, | |
} | |
return states_payload | |
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py | |
index 3355962..937d08b 100644 | |
--- a/nova/tests/unit/compute/test_compute_mgr.py | |
+++ b/nova/tests/unit/compute/test_compute_mgr.py | |
@@ -9,6 +9,13 @@ | |
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | |
# License for the specific language governing permissions and limitations | |
# under the License. | |
+# | |
+# Copyright (c) 2013-2017 Wind River Systems, Inc. | |
+# | |
+# The right to copy, distribute, modify, or otherwise make use | |
+# of this software may be licensed only pursuant to the terms | |
+# of an applicable Wind River license agreement. | |
+# | |
"""Unit tests for ComputeManager().""" | |
@@ -151,6 +158,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, | |
virtevent.EVENT_LIFECYCLE_RESUMED: power_state.RUNNING, | |
virtevent.EVENT_LIFECYCLE_SUSPENDED: | |
power_state.SUSPENDED, | |
+ # WRS: add crashed event | |
+ virtevent.EVENT_LIFECYCLE_CRASHED: power_state.CRASHED, | |
virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED: | |
power_state.PAUSED, | |
virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED: | |
@@ -874,7 +883,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, | |
mock_host_get.assert_called_once_with(self.context, our_host, | |
expected_attrs=['info_cache', 'metadata']) | |
mock_init_virt.assert_called_once_with() | |
- mock_temp_mut.assert_called_once_with(self.context, read_deleted='yes') | |
+ mock_temp_mut.assert_called_with(self.context, read_deleted='yes') | |
mock_get_inst.assert_called_once_with(self.context) | |
mock_get_net.assert_called_once_with(self.context, deleted_instance) | |
@@ -1850,8 +1859,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, | |
self.assertTrue(mock_save.called) | |
def test_sync_instance_power_state_to_stop(self): | |
- for ps in (power_state.SHUTDOWN, power_state.CRASHED, | |
- power_state.SUSPENDED): | |
+ # WRS: don't include crashed state - see added testcase | |
+ for ps in (power_state.SHUTDOWN, power_state.SUSPENDED): | |
self._test_sync_to_stop(power_state.RUNNING, vm_states.ACTIVE, ps) | |
for ps in (power_state.SHUTDOWN, power_state.CRASHED): | |
@@ -1861,6 +1870,25 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, | |
self._test_sync_to_stop(power_state.SHUTDOWN, vm_states.STOPPED, | |
power_state.RUNNING, force=True) | |
+ # WRS: add testcase for crashed scenario. Detect call to _request_recovery | |
+ def test_sync_instance_power_state_crashed(self): | |
+ driver_power_state = power_state.CRASHED | |
+ instance = self._get_sync_instance( | |
+ power_state.RUNNING, vm_states.ACTIVE) | |
+ | |
+ self.mox.StubOutWithMock(objects.Instance, 'refresh') | |
+ self.mox.StubOutWithMock(objects.Instance, 'save') | |
+ self.mox.StubOutWithMock(self.compute, '_request_recovery') | |
+ | |
+ instance.refresh(use_slave=False) | |
+ instance.save() | |
+ self.compute._request_recovery(self.context, instance) | |
+ self.mox.ReplayAll() | |
+ self.compute._sync_instance_power_state(self.context, instance, | |
+ driver_power_state) | |
+ self.mox.VerifyAll() | |
+ self.mox.UnsetStubs() | |
+ | |
def test_sync_instance_power_state_to_terminate(self): | |
self._test_sync_to_stop(power_state.RUNNING, vm_states.ACTIVE, | |
power_state.SHUTDOWN, | |
@@ -4279,6 +4307,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, | |
self.context, vm_state=vm_states.ACTIVE, | |
task_state=None, power_state=power_state.SHUTDOWN) | |
+ @mock.patch.object(instance, 'refresh') | |
@mock.patch.object(self.compute, '_get_power_state', | |
return_value=power_state.SHUTDOWN) | |
@mock.patch.object(compute_utils, 'notify_about_instance_action') | |
@@ -4286,11 +4315,12 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, | |
@mock.patch.object(self.compute, '_power_off_instance') | |
@mock.patch.object(instance, 'save') | |
def do_test(save_mock, power_off_mock, notify_mock, | |
- notify_action_mock, get_state_mock): | |
+ notify_action_mock, get_state_mock, refresh_mock): | |
# run the code | |
self.compute.stop_instance(self.context, instance, True) | |
# assert the calls | |
self.assertEqual(2, get_state_mock.call_count) | |
+ refresh_mock.assert_called_once_with() | |
notify_mock.assert_has_calls([ | |
mock.call(self.context, instance, 'power_off.start'), | |
mock.call(self.context, instance, 'power_off.end') | |
diff --git a/nova/tests/unit/test_notifications.py b/nova/tests/unit/test_notifications.py | |
index aa058ad..68064d6 100644 | |
--- a/nova/tests/unit/test_notifications.py | |
+++ b/nova/tests/unit/test_notifications.py | |
@@ -12,6 +12,13 @@ | |
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | |
# License for the specific language governing permissions and limitations | |
# under the License. | |
+# | |
+# Copyright (c) 2013-2017 Wind River Systems, Inc. | |
+# | |
+# The right to copy, distribute, modify, or otherwise make use | |
+# of this software may be licensed only pursuant to the terms | |
+# of an applicable Wind River license agreement. | |
+# | |
"""Tests for common notifications.""" | |
@@ -536,8 +543,10 @@ class NotificationsTestCase(test.TestCase): | |
class NotificationsFormatTestCase(test.NoDBTestCase): | |
def test_state_computation(self): | |
+ # WRS: add power_state | |
instance = {'vm_state': mock.sentinel.vm_state, | |
- 'task_state': mock.sentinel.task_state} | |
+ 'task_state': mock.sentinel.task_state, | |
+ 'power_state': None} | |
states = notifications._compute_states_payload(instance) | |
self.assertEqual(mock.sentinel.vm_state, states['state']) | |
self.assertEqual(mock.sentinel.vm_state, states['old_state']) | |
diff --git a/nova/tests/unit/virt/libvirt/fakelibvirt.py b/nova/tests/unit/virt/libvirt/fakelibvirt.py | |
index 03a70fc..aaf35aa 100644 | |
--- a/nova/tests/unit/virt/libvirt/fakelibvirt.py | |
+++ b/nova/tests/unit/virt/libvirt/fakelibvirt.py | |
@@ -11,6 +11,13 @@ | |
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | |
# License for the specific language governing permissions and limitations | |
# under the License. | |
+# | |
+# Copyright (c) 2013-2017 Wind River Systems, Inc. | |
+# | |
+# The right to copy, distribute, modify, or otherwise make use | |
+# of this software may be licensed only pursuant to the terms | |
+# of an applicable Wind River license agreement. | |
+# | |
import sys | |
import textwrap | |
@@ -109,6 +116,15 @@ VIR_NODE_CPU_STATS_ALL_CPUS = -1 | |
VIR_DOMAIN_START_PAUSED = 1 | |
+# WRS: add virDomainEventStoppedDetailType enum definitions | |
+VIR_DOMAIN_EVENT_STOPPED_SHUTDOWN = 0 | |
+VIR_DOMAIN_EVENT_STOPPED_DESTROYED = 1 | |
+VIR_DOMAIN_EVENT_STOPPED_CRASHED = 2 | |
+VIR_DOMAIN_EVENT_STOPPED_MIGRATED = 3 | |
+VIR_DOMAIN_EVENT_STOPPED_SAVED = 4 | |
+VIR_DOMAIN_EVENT_STOPPED_FAILED = 5 | |
+VIR_DOMAIN_EVENT_STOPPED_FROM_SNAPSHOT = 6 | |
+ | |
# libvirtError enums | |
# (Intentionally different from what's in libvirt. We do this to check, | |
# that consumers of the library are using the symbolic names rather than | |
diff --git a/nova/tests/unit/virt/libvirt/test_host.py b/nova/tests/unit/virt/libvirt/test_host.py | |
index e4f7bf1..e375aba 100644 | |
--- a/nova/tests/unit/virt/libvirt/test_host.py | |
+++ b/nova/tests/unit/virt/libvirt/test_host.py | |
@@ -13,6 +13,13 @@ | |
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | |
# License for the specific language governing permissions and limitations | |
# under the License. | |
+# | |
+# Copyright (c) 2016-2017 Wind River Systems, Inc. | |
+# | |
+# The right to copy, distribute, modify, or otherwise make use | |
+# of this software may be licensed only pursuant to the terms | |
+# of an applicable Wind River license agreement. | |
+# | |
import eventlet | |
from eventlet import greenthread | |
@@ -192,6 +199,44 @@ class HostTestCase(test.NoDBTestCase): | |
self.assertEqual(got_events[0].transition, | |
event.EVENT_LIFECYCLE_STOPPED) | |
+ # WRS: add testcase for crashed event | |
+ def test_event_lifecycle_crashed(self): | |
+ got_events = [] | |
+ | |
+ def handler(event): | |
+ got_events.append(event) | |
+ | |
+ hostimpl = host.Host("qemu:///system", | |
+ lifecycle_event_handler=handler) | |
+ | |
+ conn = hostimpl.get_connection() | |
+ hostimpl._init_events_pipe() | |
+ | |
+ fake_dom_xml = """ | |
+ <domain type='kvm'> | |
+ <uuid>cef19ce0-0ca2-11df-855d-b19fbce37686</uuid> | |
+ <devices> | |
+ <disk type='file'> | |
+ <source file='filename'/> | |
+ </disk> | |
+ </devices> | |
+ </domain> | |
+ """ | |
+ dom = fakelibvirt.Domain(conn, | |
+ fake_dom_xml, | |
+ False) | |
+ hostimpl._event_lifecycle_callback( | |
+ conn, dom, fakelibvirt.VIR_DOMAIN_EVENT_STOPPED, | |
+ fakelibvirt.VIR_DOMAIN_EVENT_STOPPED_FAILED, hostimpl) | |
+ hostimpl._dispatch_events() | |
+ | |
+ self.assertEqual(len(got_events), 1) | |
+ self.assertIsInstance(got_events[0], event.LifecycleEvent) | |
+ self.assertEqual(got_events[0].uuid, | |
+ "cef19ce0-0ca2-11df-855d-b19fbce37686") | |
+ self.assertEqual(got_events[0].transition, | |
+ event.EVENT_LIFECYCLE_CRASHED) | |
+ | |
def test_event_lifecycle_callback_suspended_old_libvirt(self): | |
"""Tests the suspended lifecycle event with libvirt before post-copy | |
""" | |
diff --git a/nova/virt/event.py b/nova/virt/event.py | |
index 6e4e01e..6af0b9b 100644 | |
--- a/nova/virt/event.py | |
+++ b/nova/virt/event.py | |
@@ -11,6 +11,13 @@ | |
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | |
# License for the specific language governing permissions and limitations | |
# under the License. | |
+# | |
+# Copyright (c) 2013-2017 Wind River Systems, Inc. | |
+# | |
+# The right to copy, distribute, modify, or otherwise make use | |
+# of this software may be licensed only pursuant to the terms | |
+# of an applicable Wind River license agreement. | |
+# | |
""" | |
Asynchronous event notifications from virtualization drivers. | |
@@ -31,7 +38,8 @@ EVENT_LIFECYCLE_RESUMED = 3 | |
EVENT_LIFECYCLE_SUSPENDED = 4 | |
EVENT_LIFECYCLE_POSTCOPY_STARTED = 5 | |
EVENT_LIFECYCLE_MIGRATION_COMPLETED = 6 | |
- | |
+# WRS: add crashed event | |
+EVENT_LIFECYCLE_CRASHED = 7 | |
NAMES = { | |
EVENT_LIFECYCLE_STARTED: _('Started'), | |
@@ -41,6 +49,7 @@ NAMES = { | |
EVENT_LIFECYCLE_SUSPENDED: _('Suspended'), | |
EVENT_LIFECYCLE_POSTCOPY_STARTED: _('Postcopy started'), | |
EVENT_LIFECYCLE_MIGRATION_COMPLETED: _('Migration completed'), | |
+ EVENT_LIFECYCLE_CRASHED: _('Crashed'), | |
} | |
diff --git a/nova/virt/libvirt/host.py b/nova/virt/libvirt/host.py | |
index baf6434..8f87252 100644 | |
--- a/nova/virt/libvirt/host.py | |
+++ b/nova/virt/libvirt/host.py | |
@@ -17,6 +17,13 @@ | |
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | |
# License for the specific language governing permissions and limitations | |
# under the License. | |
+# | |
+# Copyright (c) 2013-2017 Wind River Systems, Inc. | |
+# | |
+# The right to copy, distribute, modify, or otherwise make use | |
+# of this software may be licensed only pursuant to the terms | |
+# of an applicable Wind River license agreement. | |
+# | |
""" | |
Manages information about the host OS and hypervisor. | |
@@ -169,6 +176,11 @@ class Host(object): | |
transition = None | |
if event == libvirt.VIR_DOMAIN_EVENT_STOPPED: | |
transition = virtevent.EVENT_LIFECYCLE_STOPPED | |
+ # WRS: transition to crashed if stop failed | |
+ if detail == libvirt.VIR_DOMAIN_EVENT_STOPPED_FAILED: | |
+ transition = virtevent.EVENT_LIFECYCLE_CRASHED | |
+ else: | |
+ transition = virtevent.EVENT_LIFECYCLE_STOPPED | |
elif event == libvirt.VIR_DOMAIN_EVENT_STARTED: | |
transition = virtevent.EVENT_LIFECYCLE_STARTED | |
elif event == libvirt.VIR_DOMAIN_EVENT_SUSPENDED: | |
-- | |
2.7.4 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment