]> xenbits.xensource.com Git - osstest/openstack-nova.git/commitdiff
fix "down" nova-compute service spuriously marked as "up"
authorChris Friesen <chris.friesen@windriver.com>
Fri, 27 Mar 2015 15:23:48 +0000 (09:23 -0600)
committerChris Friesen <chris.friesen@windriver.com>
Wed, 10 Jun 2015 19:18:06 +0000 (13:18 -0600)
Currently we use the auto-updated "updated_at" field to determine
whether a service is "up".  An end-user can cause the "updated_at"
field to be updated by disabling or enabling the service, thus
potentially causing a service that is unavailable to be detected
as "up".  This could result in the scheduler trying to assign
instances to an unavailable compute node, or in the system
mistakenly preventing evacuation of an instance.

The fix is to add a new field to explicitly track the timestamp of
the last time the service sent in a status report and use that if
available when testing whether the service is up.

DocImpact
This commit will cause a behaviour change for the DB servicegroup
driver.  It will mean that enabling/disabling the service will
cause the "updated_at" field to change (as before) but that will
no longer be tied to the "up/down" status of the service. So
"nova service-list" could show the service as "down" even if it
shows a recent "updated_at".  (But this could happen for the other
servicegroup drivers already.)

Closes-Bug: #1420848
Change-Id: Ied7d47363d0489bca3cf2c711217e1a3b7d24a03

nova/db/sqlalchemy/api.py
nova/db/sqlalchemy/migrate_repo/versions/294_add_service_heartbeat.py [new file with mode: 0644]
nova/db/sqlalchemy/models.py
nova/objects/service.py
nova/servicegroup/drivers/db.py
nova/tests/unit/compute/test_resource_tracker.py
nova/tests/unit/db/test_migrations.py
nova/tests/unit/objects/test_objects.py
nova/tests/unit/objects/test_service.py
nova/tests/unit/servicegroup/test_db_servicegroup.py

index d24877389c61adb6c5d4671d84c7499bb0831da7..daf69ce9bea2bbff014bceba73722ed053dba777 100644 (file)
@@ -540,6 +540,12 @@ def service_update(context, service_id, values):
     session = get_session()
     with session.begin():
         service_ref = _service_get(context, service_id, session=session)
+        # Only servicegroup.drivers.db.DbDriver._report_state() updates
+        # 'report_count', so if that value changes then store the timestamp
+        # as the last time we got a state report.
+        if 'report_count' in values:
+            if values['report_count'] > service_ref.report_count:
+                service_ref.last_seen_up = timeutils.utcnow()
         service_ref.update(values)
 
     return service_ref
diff --git a/nova/db/sqlalchemy/migrate_repo/versions/294_add_service_heartbeat.py b/nova/db/sqlalchemy/migrate_repo/versions/294_add_service_heartbeat.py
new file mode 100644 (file)
index 0000000..20af7b0
--- /dev/null
@@ -0,0 +1,29 @@
+# Copyright (c) 2015 Wind River Systems Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from sqlalchemy import MetaData, Table, Column, DateTime
+
+BASE_TABLE_NAME = 'services'
+NEW_COLUMN_NAME = 'last_seen_up'
+
+
+def upgrade(migrate_engine):
+    meta = MetaData()
+    meta.bind = migrate_engine
+
+    for prefix in ('', 'shadow_'):
+        table = Table(prefix + BASE_TABLE_NAME, meta, autoload=True)
+        new_column = Column(NEW_COLUMN_NAME, DateTime, nullable=True)
+        if not hasattr(table.c, NEW_COLUMN_NAME):
+            table.create_column(new_column)
index 7e372457b3af5dd3c17dee513abc8134b09fb200..2af86554cc610b1314ce278c2e31956bc184421f 100644 (file)
@@ -100,6 +100,7 @@ class Service(BASE, NovaBase):
     report_count = Column(Integer, nullable=False, default=0)
     disabled = Column(Boolean, default=False)
     disabled_reason = Column(String(255))
+    last_seen_up = Column(DateTime, nullable=True)
 
 
 class ComputeNode(BASE, NovaBase):
index ce5dd9b6ceebc942aa35ef8523bbd70deba31cb4..fbeda7d859c1ab4846d248a09f131844bc04467c 100644 (file)
@@ -43,7 +43,8 @@ class Service(base.NovaPersistentObject, base.NovaObject,
     # Version 1.10: Changes behaviour of loading compute_node
     # Version 1.11: Added get_by_host_and_binary
     # Version 1.12: ComputeNode version 1.11
-    VERSION = '1.12'
+    # Version 1.13: Added last_seen_up
+    VERSION = '1.13'
 
     fields = {
         'id': fields.IntegerField(read_only=True),
@@ -55,7 +56,8 @@ class Service(base.NovaPersistentObject, base.NovaObject,
         'disabled_reason': fields.StringField(nullable=True),
         'availability_zone': fields.StringField(nullable=True),
         'compute_node': fields.ObjectField('ComputeNode'),
-        }
+        'last_seen_up': fields.DateTimeField(nullable=True),
+    }
 
     obj_relationships = {
         'compute_node': [('1.1', '1.4'), ('1.3', '1.5'), ('1.5', '1.6'),
@@ -65,6 +67,8 @@ class Service(base.NovaPersistentObject, base.NovaObject,
 
     def obj_make_compatible(self, primitive, target_version):
         _target_version = utils.convert_version_to_tuple(target_version)
+        if _target_version < (1, 13) and 'last_seen_up' in primitive:
+            del primitive['last_seen_up']
         if _target_version < (1, 10):
             target_compute_version = self.obj_calculate_child_version(
                 target_version, 'compute_node')
@@ -194,7 +198,8 @@ class ServiceList(base.ObjectListBase, base.NovaObject):
     # Version 1.8: Service version 1.10
     # Version 1.9: Added get_by_binary() and Service version 1.11
     # Version 1.10: Service version 1.12
-    VERSION = '1.10'
+    # Version 1.11: Service version 1.13
+    VERSION = '1.11'
 
     fields = {
         'objects': fields.ListOfObjectsField('Service'),
@@ -212,6 +217,7 @@ class ServiceList(base.ObjectListBase, base.NovaObject):
         '1.8': '1.10',
         '1.9': '1.11',
         '1.10': '1.12',
+        '1.11': '1.13',
         }
 
     @base.remotable_classmethod
index b204b40f9b3449185ecda0056b425f4c9de99161..d8c29f153f3e2fde4ac18502b6b37b37deb1a299 100644 (file)
@@ -57,7 +57,10 @@ class DbDriver(base.Driver):
         """Moved from nova.utils
         Check whether a service is up based on last heartbeat.
         """
-        last_heartbeat = service_ref['updated_at'] or service_ref['created_at']
+        # Keep checking 'updated_at' if 'last_seen_up' isn't set.
+        # Should be able to use only 'last_seen_up' in the M release
+        last_heartbeat = (service_ref.get('last_seen_up') or
+            service_ref['updated_at'] or service_ref['created_at'])
         if isinstance(last_heartbeat, six.string_types):
             # NOTE(russellb) If this service_ref came in over rpc via
             # conductor, then the timestamp will be a string and needs to be
index f683b98cc70539af8923dcefb294b139c13af0dc..42146bf50ad71b1300deb774815a6ee7b8ac46da 100644 (file)
@@ -281,6 +281,7 @@ class BaseTestCase(test.TestCase):
             'updated_at': None,
             'deleted_at': None,
             'deleted': False,
+            'last_seen_up': None,
         }
         return service
 
index 9059476853f26d84ac559fec4d9236eb627a59c2..9cb15467a5a14134c143fd71e3c2dcabdd168537 100644 (file)
@@ -693,6 +693,18 @@ class NovaMigrationsCheckers(test_migrations.ModelsMigrationsSync,
         self.assertIsNone(fake_migration.migration_type)
         self.assertFalse(fake_migration.hidden)
 
+    def _check_294(self, engine, data):
+        self.assertColumnExists(engine, 'services', 'last_seen_up')
+        self.assertColumnExists(engine, 'shadow_services', 'last_seen_up')
+
+        services = oslodbutils.get_table(engine, 'services')
+        shadow_services = oslodbutils.get_table(
+                engine, 'shadow_services')
+        self.assertIsInstance(services.c.last_seen_up.type,
+                              sqlalchemy.types.DateTime)
+        self.assertIsInstance(shadow_services.c.last_seen_up.type,
+                              sqlalchemy.types.DateTime)
+
 
 class TestNovaMigrationsSQLite(NovaMigrationsCheckers,
                                test_base.DbTestCase,
index d9509a8173d32f641ac9f7db752b36a228066f15..a6fc65d56df062e653e094c3c727d06ffc64d06d 100644 (file)
@@ -1156,8 +1156,8 @@ object_data = {
     'SecurityGroupList': '1.0-a3bb51998e7d2a95b3e613111e853817',
     'SecurityGroupRule': '1.1-ae1da17b79970012e8536f88cb3c6b29',
     'SecurityGroupRuleList': '1.1-521f1aeb7b0cc00d026175509289d020',
-    'Service': '1.12-220e3f7aec641ed08dc4a32af4f38915',
-    'ServiceList': '1.10-2f49ab65571c0edcbf623f664da612c0',
+    'Service': '1.13-bc6c9671a91439e08224c2652da5fc4c',
+    'ServiceList': '1.11-d1728430a30700c143e542b7c75f65b0',
     'Tag': '1.0-616bf44af4a22e853c17b37a758ec73e',
     'TagList': '1.0-e16d65894484b7530b720792ffbbbd02',
     'TestSubclassedObject': '1.6-716fc8b481c9374f7e222de03ba0a621',
index 3d25b8f86cdfe25ed81270fa6898c24b1b3f9f25..71438f4c88c739c90be1fe2bad3b013a2a06e57d 100644 (file)
@@ -36,6 +36,7 @@ fake_service = {
     'report_count': 1,
     'disabled': False,
     'disabled_reason': None,
+    'last_seen_up': None,
     }
 
 OPTIONAL = ['availability_zone', 'compute_node']
index 6e5b98f5bccaf6faf3f57429b0fa566749e4ce66..abea18d96d397953b1d95b707a43a7da8049f226 100644 (file)
@@ -40,6 +40,7 @@ class DBServiceGroupTestCase(test.NoDBTestCase):
 
         # Up (equal)
         now_mock.return_value = fts_func(fake_now)
+        service_ref['last_seen_up'] = fts_func(fake_now - self.down_time)
         service_ref['updated_at'] = fts_func(fake_now - self.down_time)
         service_ref['created_at'] = fts_func(fake_now - self.down_time)
 
@@ -47,17 +48,25 @@ class DBServiceGroupTestCase(test.NoDBTestCase):
         self.assertTrue(result)
 
         # Up
+        service_ref['last_seen_up'] = fts_func(fake_now - self.down_time + 1)
         service_ref['updated_at'] = fts_func(fake_now - self.down_time + 1)
         service_ref['created_at'] = fts_func(fake_now - self.down_time + 1)
         result = self.servicegroup_api.service_is_up(service_ref)
         self.assertTrue(result)
 
         # Down
+        service_ref['last_seen_up'] = fts_func(fake_now - self.down_time - 3)
         service_ref['updated_at'] = fts_func(fake_now - self.down_time - 3)
         service_ref['created_at'] = fts_func(fake_now - self.down_time - 3)
         result = self.servicegroup_api.service_is_up(service_ref)
         self.assertFalse(result)
 
+        # "last_seen_up" says down, "updated_at" says up.
+        # This can happen if we do a service disable/enable while it's down.
+        service_ref['updated_at'] = fts_func(fake_now - self.down_time + 1)
+        result = self.servicegroup_api.service_is_up(service_ref)
+        self.assertFalse(result)
+
     def test_join(self):
         service = mock.MagicMock(report_interval=1)