From: Chris Friesen Date: Fri, 27 Mar 2015 15:23:48 +0000 (-0600) Subject: fix "down" nova-compute service spuriously marked as "up" X-Git-Url: http://xenbits.xensource.com/gitweb?a=commitdiff_plain;h=b9bae02af2168ad64d3b3d28c97c3853cee73272;p=osstest%2Fopenstack-nova.git fix "down" nova-compute service spuriously marked as "up" Currently we use the auto-updated "updated_at" field to determine whether a service is "up". An end-user can cause the "updated_at" field to be updated by disabling or enabling the service, thus potentially causing a service that is unavailable to be detected as "up". This could result in the scheduler trying to assign instances to an unavailable compute node, or in the system mistakenly preventing evacuation of an instance. The fix is to add a new field to explicitly track the timestamp of the last time the service sent in a status report and use that if available when testing whether the service is up. DocImpact This commit will cause a behaviour change for the DB servicegroup driver. It will mean that enabling/disabling the service will cause the "updated_at" field to change (as before) but that will no longer be tied to the "up/down" status of the service. So "nova service-list" could show the service as "down" even if it shows a recent "updated_at". (But this could happen for the other servicegroup drivers already.) Closes-Bug: #1420848 Change-Id: Ied7d47363d0489bca3cf2c711217e1a3b7d24a03 --- diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py index d24877389c..daf69ce9be 100644 --- a/nova/db/sqlalchemy/api.py +++ b/nova/db/sqlalchemy/api.py @@ -540,6 +540,12 @@ def service_update(context, service_id, values): session = get_session() with session.begin(): service_ref = _service_get(context, service_id, session=session) + # Only servicegroup.drivers.db.DbDriver._report_state() updates + # 'report_count', so if that value changes then store the timestamp + # as the last time we got a state report. + if 'report_count' in values: + if values['report_count'] > service_ref.report_count: + service_ref.last_seen_up = timeutils.utcnow() service_ref.update(values) return service_ref diff --git a/nova/db/sqlalchemy/migrate_repo/versions/294_add_service_heartbeat.py b/nova/db/sqlalchemy/migrate_repo/versions/294_add_service_heartbeat.py new file mode 100644 index 0000000000..20af7b0c11 --- /dev/null +++ b/nova/db/sqlalchemy/migrate_repo/versions/294_add_service_heartbeat.py @@ -0,0 +1,29 @@ +# Copyright (c) 2015 Wind River Systems Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from sqlalchemy import MetaData, Table, Column, DateTime + +BASE_TABLE_NAME = 'services' +NEW_COLUMN_NAME = 'last_seen_up' + + +def upgrade(migrate_engine): + meta = MetaData() + meta.bind = migrate_engine + + for prefix in ('', 'shadow_'): + table = Table(prefix + BASE_TABLE_NAME, meta, autoload=True) + new_column = Column(NEW_COLUMN_NAME, DateTime, nullable=True) + if not hasattr(table.c, NEW_COLUMN_NAME): + table.create_column(new_column) diff --git a/nova/db/sqlalchemy/models.py b/nova/db/sqlalchemy/models.py index 7e372457b3..2af86554cc 100644 --- a/nova/db/sqlalchemy/models.py +++ b/nova/db/sqlalchemy/models.py @@ -100,6 +100,7 @@ class Service(BASE, NovaBase): report_count = Column(Integer, nullable=False, default=0) disabled = Column(Boolean, default=False) disabled_reason = Column(String(255)) + last_seen_up = Column(DateTime, nullable=True) class ComputeNode(BASE, NovaBase): diff --git a/nova/objects/service.py b/nova/objects/service.py index ce5dd9b6ce..fbeda7d859 100644 --- a/nova/objects/service.py +++ b/nova/objects/service.py @@ -43,7 +43,8 @@ class Service(base.NovaPersistentObject, base.NovaObject, # Version 1.10: Changes behaviour of loading compute_node # Version 1.11: Added get_by_host_and_binary # Version 1.12: ComputeNode version 1.11 - VERSION = '1.12' + # Version 1.13: Added last_seen_up + VERSION = '1.13' fields = { 'id': fields.IntegerField(read_only=True), @@ -55,7 +56,8 @@ class Service(base.NovaPersistentObject, base.NovaObject, 'disabled_reason': fields.StringField(nullable=True), 'availability_zone': fields.StringField(nullable=True), 'compute_node': fields.ObjectField('ComputeNode'), - } + 'last_seen_up': fields.DateTimeField(nullable=True), + } obj_relationships = { 'compute_node': [('1.1', '1.4'), ('1.3', '1.5'), ('1.5', '1.6'), @@ -65,6 +67,8 @@ class Service(base.NovaPersistentObject, base.NovaObject, def obj_make_compatible(self, primitive, target_version): _target_version = utils.convert_version_to_tuple(target_version) + if _target_version < (1, 13) and 'last_seen_up' in primitive: + del primitive['last_seen_up'] if _target_version < (1, 10): target_compute_version = self.obj_calculate_child_version( target_version, 'compute_node') @@ -194,7 +198,8 @@ class ServiceList(base.ObjectListBase, base.NovaObject): # Version 1.8: Service version 1.10 # Version 1.9: Added get_by_binary() and Service version 1.11 # Version 1.10: Service version 1.12 - VERSION = '1.10' + # Version 1.11: Service version 1.13 + VERSION = '1.11' fields = { 'objects': fields.ListOfObjectsField('Service'), @@ -212,6 +217,7 @@ class ServiceList(base.ObjectListBase, base.NovaObject): '1.8': '1.10', '1.9': '1.11', '1.10': '1.12', + '1.11': '1.13', } @base.remotable_classmethod diff --git a/nova/servicegroup/drivers/db.py b/nova/servicegroup/drivers/db.py index b204b40f9b..d8c29f153f 100644 --- a/nova/servicegroup/drivers/db.py +++ b/nova/servicegroup/drivers/db.py @@ -57,7 +57,10 @@ class DbDriver(base.Driver): """Moved from nova.utils Check whether a service is up based on last heartbeat. """ - last_heartbeat = service_ref['updated_at'] or service_ref['created_at'] + # Keep checking 'updated_at' if 'last_seen_up' isn't set. + # Should be able to use only 'last_seen_up' in the M release + last_heartbeat = (service_ref.get('last_seen_up') or + service_ref['updated_at'] or service_ref['created_at']) if isinstance(last_heartbeat, six.string_types): # NOTE(russellb) If this service_ref came in over rpc via # conductor, then the timestamp will be a string and needs to be diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py index f683b98cc7..42146bf50a 100644 --- a/nova/tests/unit/compute/test_resource_tracker.py +++ b/nova/tests/unit/compute/test_resource_tracker.py @@ -281,6 +281,7 @@ class BaseTestCase(test.TestCase): 'updated_at': None, 'deleted_at': None, 'deleted': False, + 'last_seen_up': None, } return service diff --git a/nova/tests/unit/db/test_migrations.py b/nova/tests/unit/db/test_migrations.py index 9059476853..9cb15467a5 100644 --- a/nova/tests/unit/db/test_migrations.py +++ b/nova/tests/unit/db/test_migrations.py @@ -693,6 +693,18 @@ class NovaMigrationsCheckers(test_migrations.ModelsMigrationsSync, self.assertIsNone(fake_migration.migration_type) self.assertFalse(fake_migration.hidden) + def _check_294(self, engine, data): + self.assertColumnExists(engine, 'services', 'last_seen_up') + self.assertColumnExists(engine, 'shadow_services', 'last_seen_up') + + services = oslodbutils.get_table(engine, 'services') + shadow_services = oslodbutils.get_table( + engine, 'shadow_services') + self.assertIsInstance(services.c.last_seen_up.type, + sqlalchemy.types.DateTime) + self.assertIsInstance(shadow_services.c.last_seen_up.type, + sqlalchemy.types.DateTime) + class TestNovaMigrationsSQLite(NovaMigrationsCheckers, test_base.DbTestCase, diff --git a/nova/tests/unit/objects/test_objects.py b/nova/tests/unit/objects/test_objects.py index d9509a8173..a6fc65d56d 100644 --- a/nova/tests/unit/objects/test_objects.py +++ b/nova/tests/unit/objects/test_objects.py @@ -1156,8 +1156,8 @@ object_data = { 'SecurityGroupList': '1.0-a3bb51998e7d2a95b3e613111e853817', 'SecurityGroupRule': '1.1-ae1da17b79970012e8536f88cb3c6b29', 'SecurityGroupRuleList': '1.1-521f1aeb7b0cc00d026175509289d020', - 'Service': '1.12-220e3f7aec641ed08dc4a32af4f38915', - 'ServiceList': '1.10-2f49ab65571c0edcbf623f664da612c0', + 'Service': '1.13-bc6c9671a91439e08224c2652da5fc4c', + 'ServiceList': '1.11-d1728430a30700c143e542b7c75f65b0', 'Tag': '1.0-616bf44af4a22e853c17b37a758ec73e', 'TagList': '1.0-e16d65894484b7530b720792ffbbbd02', 'TestSubclassedObject': '1.6-716fc8b481c9374f7e222de03ba0a621', diff --git a/nova/tests/unit/objects/test_service.py b/nova/tests/unit/objects/test_service.py index 3d25b8f86c..71438f4c88 100644 --- a/nova/tests/unit/objects/test_service.py +++ b/nova/tests/unit/objects/test_service.py @@ -36,6 +36,7 @@ fake_service = { 'report_count': 1, 'disabled': False, 'disabled_reason': None, + 'last_seen_up': None, } OPTIONAL = ['availability_zone', 'compute_node'] diff --git a/nova/tests/unit/servicegroup/test_db_servicegroup.py b/nova/tests/unit/servicegroup/test_db_servicegroup.py index 6e5b98f5bc..abea18d96d 100644 --- a/nova/tests/unit/servicegroup/test_db_servicegroup.py +++ b/nova/tests/unit/servicegroup/test_db_servicegroup.py @@ -40,6 +40,7 @@ class DBServiceGroupTestCase(test.NoDBTestCase): # Up (equal) now_mock.return_value = fts_func(fake_now) + service_ref['last_seen_up'] = fts_func(fake_now - self.down_time) service_ref['updated_at'] = fts_func(fake_now - self.down_time) service_ref['created_at'] = fts_func(fake_now - self.down_time) @@ -47,17 +48,25 @@ class DBServiceGroupTestCase(test.NoDBTestCase): self.assertTrue(result) # Up + service_ref['last_seen_up'] = fts_func(fake_now - self.down_time + 1) service_ref['updated_at'] = fts_func(fake_now - self.down_time + 1) service_ref['created_at'] = fts_func(fake_now - self.down_time + 1) result = self.servicegroup_api.service_is_up(service_ref) self.assertTrue(result) # Down + service_ref['last_seen_up'] = fts_func(fake_now - self.down_time - 3) service_ref['updated_at'] = fts_func(fake_now - self.down_time - 3) service_ref['created_at'] = fts_func(fake_now - self.down_time - 3) result = self.servicegroup_api.service_is_up(service_ref) self.assertFalse(result) + # "last_seen_up" says down, "updated_at" says up. + # This can happen if we do a service disable/enable while it's down. + service_ref['updated_at'] = fts_func(fake_now - self.down_time + 1) + result = self.servicegroup_api.service_is_up(service_ref) + self.assertFalse(result) + def test_join(self): service = mock.MagicMock(report_interval=1)