Currently we use the auto-updated "updated_at" field to determine
whether a service is "up". An end-user can cause the "updated_at"
field to be updated by disabling or enabling the service, thus
potentially causing a service that is unavailable to be detected
as "up". This could result in the scheduler trying to assign
instances to an unavailable compute node, or in the system
mistakenly preventing evacuation of an instance.
The fix is to add a new field to explicitly track the timestamp of
the last time the service sent in a status report and use that if
available when testing whether the service is up.
DocImpact
This commit will cause a behaviour change for the DB servicegroup
driver. It will mean that enabling/disabling the service will
cause the "updated_at" field to change (as before) but that will
no longer be tied to the "up/down" status of the service. So
"nova service-list" could show the service as "down" even if it
shows a recent "updated_at". (But this could happen for the other
servicegroup drivers already.)
Closes-Bug: #
1420848
Change-Id: Ied7d47363d0489bca3cf2c711217e1a3b7d24a03
session = get_session()
with session.begin():
service_ref = _service_get(context, service_id, session=session)
+ # Only servicegroup.drivers.db.DbDriver._report_state() updates
+ # 'report_count', so if that value changes then store the timestamp
+ # as the last time we got a state report.
+ if 'report_count' in values:
+ if values['report_count'] > service_ref.report_count:
+ service_ref.last_seen_up = timeutils.utcnow()
service_ref.update(values)
return service_ref
--- /dev/null
+# Copyright (c) 2015 Wind River Systems Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from sqlalchemy import MetaData, Table, Column, DateTime
+
+BASE_TABLE_NAME = 'services'
+NEW_COLUMN_NAME = 'last_seen_up'
+
+
+def upgrade(migrate_engine):
+ meta = MetaData()
+ meta.bind = migrate_engine
+
+ for prefix in ('', 'shadow_'):
+ table = Table(prefix + BASE_TABLE_NAME, meta, autoload=True)
+ new_column = Column(NEW_COLUMN_NAME, DateTime, nullable=True)
+ if not hasattr(table.c, NEW_COLUMN_NAME):
+ table.create_column(new_column)
report_count = Column(Integer, nullable=False, default=0)
disabled = Column(Boolean, default=False)
disabled_reason = Column(String(255))
+ last_seen_up = Column(DateTime, nullable=True)
class ComputeNode(BASE, NovaBase):
# Version 1.10: Changes behaviour of loading compute_node
# Version 1.11: Added get_by_host_and_binary
# Version 1.12: ComputeNode version 1.11
- VERSION = '1.12'
+ # Version 1.13: Added last_seen_up
+ VERSION = '1.13'
fields = {
'id': fields.IntegerField(read_only=True),
'disabled_reason': fields.StringField(nullable=True),
'availability_zone': fields.StringField(nullable=True),
'compute_node': fields.ObjectField('ComputeNode'),
- }
+ 'last_seen_up': fields.DateTimeField(nullable=True),
+ }
obj_relationships = {
'compute_node': [('1.1', '1.4'), ('1.3', '1.5'), ('1.5', '1.6'),
def obj_make_compatible(self, primitive, target_version):
_target_version = utils.convert_version_to_tuple(target_version)
+ if _target_version < (1, 13) and 'last_seen_up' in primitive:
+ del primitive['last_seen_up']
if _target_version < (1, 10):
target_compute_version = self.obj_calculate_child_version(
target_version, 'compute_node')
# Version 1.8: Service version 1.10
# Version 1.9: Added get_by_binary() and Service version 1.11
# Version 1.10: Service version 1.12
- VERSION = '1.10'
+ # Version 1.11: Service version 1.13
+ VERSION = '1.11'
fields = {
'objects': fields.ListOfObjectsField('Service'),
'1.8': '1.10',
'1.9': '1.11',
'1.10': '1.12',
+ '1.11': '1.13',
}
@base.remotable_classmethod
"""Moved from nova.utils
Check whether a service is up based on last heartbeat.
"""
- last_heartbeat = service_ref['updated_at'] or service_ref['created_at']
+ # Keep checking 'updated_at' if 'last_seen_up' isn't set.
+ # Should be able to use only 'last_seen_up' in the M release
+ last_heartbeat = (service_ref.get('last_seen_up') or
+ service_ref['updated_at'] or service_ref['created_at'])
if isinstance(last_heartbeat, six.string_types):
# NOTE(russellb) If this service_ref came in over rpc via
# conductor, then the timestamp will be a string and needs to be
'updated_at': None,
'deleted_at': None,
'deleted': False,
+ 'last_seen_up': None,
}
return service
self.assertIsNone(fake_migration.migration_type)
self.assertFalse(fake_migration.hidden)
+ def _check_294(self, engine, data):
+ self.assertColumnExists(engine, 'services', 'last_seen_up')
+ self.assertColumnExists(engine, 'shadow_services', 'last_seen_up')
+
+ services = oslodbutils.get_table(engine, 'services')
+ shadow_services = oslodbutils.get_table(
+ engine, 'shadow_services')
+ self.assertIsInstance(services.c.last_seen_up.type,
+ sqlalchemy.types.DateTime)
+ self.assertIsInstance(shadow_services.c.last_seen_up.type,
+ sqlalchemy.types.DateTime)
+
class TestNovaMigrationsSQLite(NovaMigrationsCheckers,
test_base.DbTestCase,
'SecurityGroupList': '1.0-a3bb51998e7d2a95b3e613111e853817',
'SecurityGroupRule': '1.1-ae1da17b79970012e8536f88cb3c6b29',
'SecurityGroupRuleList': '1.1-521f1aeb7b0cc00d026175509289d020',
- 'Service': '1.12-220e3f7aec641ed08dc4a32af4f38915',
- 'ServiceList': '1.10-2f49ab65571c0edcbf623f664da612c0',
+ 'Service': '1.13-bc6c9671a91439e08224c2652da5fc4c',
+ 'ServiceList': '1.11-d1728430a30700c143e542b7c75f65b0',
'Tag': '1.0-616bf44af4a22e853c17b37a758ec73e',
'TagList': '1.0-e16d65894484b7530b720792ffbbbd02',
'TestSubclassedObject': '1.6-716fc8b481c9374f7e222de03ba0a621',
'report_count': 1,
'disabled': False,
'disabled_reason': None,
+ 'last_seen_up': None,
}
OPTIONAL = ['availability_zone', 'compute_node']
# Up (equal)
now_mock.return_value = fts_func(fake_now)
+ service_ref['last_seen_up'] = fts_func(fake_now - self.down_time)
service_ref['updated_at'] = fts_func(fake_now - self.down_time)
service_ref['created_at'] = fts_func(fake_now - self.down_time)
self.assertTrue(result)
# Up
+ service_ref['last_seen_up'] = fts_func(fake_now - self.down_time + 1)
service_ref['updated_at'] = fts_func(fake_now - self.down_time + 1)
service_ref['created_at'] = fts_func(fake_now - self.down_time + 1)
result = self.servicegroup_api.service_is_up(service_ref)
self.assertTrue(result)
# Down
+ service_ref['last_seen_up'] = fts_func(fake_now - self.down_time - 3)
service_ref['updated_at'] = fts_func(fake_now - self.down_time - 3)
service_ref['created_at'] = fts_func(fake_now - self.down_time - 3)
result = self.servicegroup_api.service_is_up(service_ref)
self.assertFalse(result)
+ # "last_seen_up" says down, "updated_at" says up.
+ # This can happen if we do a service disable/enable while it's down.
+ service_ref['updated_at'] = fts_func(fake_now - self.down_time + 1)
+ result = self.servicegroup_api.service_is_up(service_ref)
+ self.assertFalse(result)
+
def test_join(self):
service = mock.MagicMock(report_interval=1)