diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 12e0b61ff2..35187cc7de 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -133,6 +133,18 @@ jobs:
           - 5672:5672/tcp # AMQP standard port
           - 15672:15672/tcp # Management: HTTP, CLI
+      redis:
+        # Docker Hub image
+        image: redis
+        # Set health checks to wait until redis has started
+        options: >-
+          --name "redis"
+          --health-cmd "redis-cli ping"
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+        ports:
+          - 6379:6379/tcp

     env:
       # CI st2.conf (with ST2_CI_USER user instead of stanley)
       ST2_CONF: 'conf/st2.ci.conf'
@@ -163,11 +175,6 @@ jobs:
       - name: Install requirements
        run: |
          ./scripts/ci/install-requirements.sh
-      - name: Run Redis Service Container
-        timeout-minutes: 2
-        run: |
-          docker run --rm --detach -p 127.0.0.1:6379:6379/tcp --name redis redis:latest
-          until [ "$(docker inspect -f {{.State.Running}} redis)" == "true" ]; do sleep 0.1; done
       - name: Setup Tests
        run: |
          # prep a ci-specific dev conf file that uses runner instead of stanley
@@ -234,9 +241,6 @@ jobs:
           name: logs-py${{ matrix.python-version }}
           path: logs.tar.gz
           retention-days: 7
-      - name: Stop Redis Service Container
-        if: "${{ always() }}"
-        run: docker rm --force redis || true

   unit-tests:
     needs: pre_job
@@ -287,6 +291,19 @@ jobs:
         image: mongo:4.4
         ports:
           - 27017:27017

+      redis:
+        # Docker Hub image
+        image: redis
+        # Set health checks to wait until redis has started
+        options: >-
+          --name "redis"
+          --health-cmd "redis-cli ping"
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+        ports:
+          - 6379:6379/tcp
+
       rabbitmq:
         image: rabbitmq:3.8-management
@@ -471,21 +488,18 @@ jobs:
           #- 4369:4369/tcp # epmd
           #
-      # Used for the coordination backend for integration tests
-      # NOTE: To speed things up, we only start redis for integration tests
-      # where it's needed
-      # redis:
-      #   # Docker Hub image
-      #   image: redis
-      #   # Set health checks to wait until redis has started
-      #   options: >-
-      #     --name "redis"
-      #     --health-cmd "redis-cli ping"
-      #     --health-interval 10s
-      #     --health-timeout 5s
-      #     --health-retries 5
-      #   ports:
-      #     - 6379:6379/tcp
+      redis:
+        # Docker Hub image
+        image: redis
+        # Set health checks to wait until redis has started
+        options: >-
+          --name "redis"
+          --health-cmd "redis-cli ping"
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+        ports:
+          - 6379:6379/tcp

     env:
       TASK: '${{ matrix.task }}'
@@ -538,11 +552,6 @@ jobs:
          cp conf/st2.dev.conf "${ST2_CONF}" ; sed -i -e "s/stanley/${ST2_CI_USER}/" "${ST2_CONF}"
          sudo -E ./scripts/ci/add-itest-user-key.sh
-      - name: Run Redis Service Container
-        timeout-minutes: 2
-        run: |
-          docker run --rm --detach -p 127.0.0.1:6379:6379/tcp --name redis redis:latest
-          until [ "$(docker inspect -f {{.State.Running}} redis)" == "true" ]; do sleep 0.1; done
       - name: Permissions Workaround
        run: |
          echo "$ST2_CI_REPO_PATH"
@@ -585,9 +594,6 @@ jobs:
           name: logs-py${{ matrix.python-version }}-nose-${{ matrix.nosetests_node_index }}
           path: logs.tar.gz
           retention-days: 7
-      - name: Stop Redis Service Container
-        if: "${{ always() }}"
-        run: docker rm --force redis || true

   slack-notification:
     name: Slack notification for failed master builds
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 4b4dd3961a..43f654af7c 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -28,6 +28,9 @@ Changed
 * Update st2client deps: editor and prompt-toolkit. #6189 (by @nzlosh)
 * Updated dependency oslo.config to prepare for python 3.10 support. #6193 (by @nzlosh)
+* Updated unit tests to use redis for coordination instead of the NoOp driver. This will hopefully make CI more stable. #6245
+  Contributed by @FileMagic, @guzzijones, and @cognifloyd
+

 Added
 ~~~~~
 * Continue introducing `pants `_ to improve DX (Developer Experience)
diff --git a/Makefile b/Makefile
index 303cc5111f..9a46a71d4b 100644
--- a/Makefile
+++ b/Makefile
@@ -53,6 +53,10 @@ COVERAGE_GLOBS_QUOTED := $(foreach glob,$(COVERAGE_GLOBS),'$(glob)')

 REQUIREMENTS := test-requirements.txt requirements.txt

+# Redis config for testing
+ST2TESTS_REDIS_HOST := 127.0.0.1
+ST2TESTS_REDIS_PORT := 6379
+
 # Pin common pip version here across all the targets
 # Note! Periodic maintenance pip upgrades are required to be up-to-date with the latest pip security fixes and updates
 PIP_VERSION ?= 24.2
@@ -824,6 +828,8 @@ unit-tests: requirements .unit-tests
 		echo "Running tests in" $$component; \
 		echo "-----------------------------------------------------------"; \
 		. $(VIRTUALENV_DIR)/bin/activate; \
+		ST2TESTS_REDIS_HOST=$(ST2TESTS_REDIS_HOST) \
+		ST2TESTS_REDIS_PORT=$(ST2TESTS_REDIS_PORT) \
 		nosetests $(NOSE_OPTS) -s -v \
 		$$component/tests/unit || ((failed+=1)); \
 		echo "-----------------------------------------------------------"; \
@@ -848,6 +854,8 @@ endif
 		echo "Running tests in" $$component; \
 		echo "-----------------------------------------------------------"; \
 		. $(VIRTUALENV_DIR)/bin/activate; \
+		ST2TESTS_REDIS_HOST=$(ST2TESTS_REDIS_HOST) \
+		ST2TESTS_REDIS_PORT=$(ST2TESTS_REDIS_PORT) \
 		COVERAGE_FILE=.coverage.unit.$$(echo $$component | tr '/' '.') \
 		nosetests $(NOSE_OPTS) -s -v $(NOSE_COVERAGE_FLAGS) \
 		$(NOSE_COVERAGE_PACKAGES) \
diff --git a/st2actions/tests/unit/test_policies.py b/st2actions/tests/unit/test_policies.py
index a2c828b39b..09a401e6b0 100644
--- a/st2actions/tests/unit/test_policies.py
+++ b/st2actions/tests/unit/test_policies.py
@@ -28,6 +28,7 @@
 from st2common.transport.publishers import CUDPublisher
 from st2common.bootstrap import runnersregistrar as runners_registrar
 from st2tests import ExecutionDbTestCase
+from st2tests import config as tests_config
 from st2tests.fixtures.generic.fixture import PACK_NAME as PACK
 from st2tests.fixturesloader import FixturesLoader
 from st2tests.mocks.runners import runner
@@ -36,6 +37,8 @@
 from st2tests.policies.concurrency import FakeConcurrencyApplicator
 from st2tests.policies.mock_exception import RaiseExceptionApplicator

+# This needs to run before creating FakeConcurrencyApplicator below.
+tests_config.parse_args()

 TEST_FIXTURES = {
     "actions": ["action1.yaml"],
diff --git a/st2actions/tests/unit/test_worker.py b/st2actions/tests/unit/test_worker.py
index b335b6f2be..917d0683e1 100644
--- a/st2actions/tests/unit/test_worker.py
+++ b/st2actions/tests/unit/test_worker.py
@@ -20,12 +20,14 @@
 import mock
 import os
 from oslo_config import cfg
+from tooz.drivers.redis import RedisDriver
 import tempfile

 # This import must be early for import-time side-effects.
 from st2tests.base import DbTestCase

 import st2actions.worker as actions_worker
+import st2tests.config as tests_config
 from st2common.constants import action as action_constants
 from st2common.models.db.liveaction import LiveActionDB
 from st2common.models.system.common import ResourceReference
@@ -66,6 +68,35 @@ def setUpClass(cls):
         )
         WorkerTestCase.local_action_db = models["actions"]["local.yaml"]

+    @staticmethod
+    def reset_config(
+        graceful_shutdown=True,  # default is True (st2common.config)
+        exit_still_active_check=None,  # default is 300 (st2common.config)
+        still_active_check_interval=None,  # default is 2 (st2common.config)
+        service_registry=None,  # default is False (st2common.config)
+    ):
+        tests_config.reset()
+        tests_config.parse_args()
+        cfg.CONF.set_override(
+            name="graceful_shutdown", override=graceful_shutdown, group="actionrunner"
+        )
+        if exit_still_active_check is not None:
+            cfg.CONF.set_override(
+                name="exit_still_active_check",
+                override=exit_still_active_check,
+                group="actionrunner",
+            )
+        if still_active_check_interval is not None:
+            cfg.CONF.set_override(
+                name="still_active_check_interval",
+                override=still_active_check_interval,
+                group="actionrunner",
+            )
+        if service_registry is not None:
+            cfg.CONF.set_override(
+                name="service_registry", override=service_registry, group="coordination"
+            )
+
     def _get_liveaction_model(self, action_db, params):
         status = action_constants.LIVEACTION_STATUS_REQUESTED
         start_timestamp = date_utils.get_datetime_utc_now()
@@ -116,9 +147,8 @@ def test_non_utf8_action_result_string(self):
         )

     def test_worker_shutdown(self):
-        cfg.CONF.set_override(
-            name="graceful_shutdown", override=False, group="actionrunner"
-        )
+        self.reset_config(graceful_shutdown=False)
+
         action_worker = actions_worker.get_worker()

         temp_file = None
@@ -169,14 +199,19 @@ def test_worker_shutdown(self):
         runner_thread.wait()

     @mock.patch.object(
-        coordination.NoOpDriver,
+        RedisDriver,
         "get_members",
-        mock.MagicMock(return_value=coordination.NoOpAsyncResult("member-1")),
+        mock.MagicMock(
+            return_value=coordination.NoOpAsyncResult(("member-1", "member-2"))
+        ),
     )
     def test_worker_graceful_shutdown_with_multiple_runners(self):
-        cfg.CONF.set_override(
-            name="graceful_shutdown", override=True, group="actionrunner"
+        self.reset_config(
+            exit_still_active_check=10,
+            still_active_check_interval=1,
+            service_registry=True,
         )
+
         action_worker = actions_worker.get_worker()

         temp_file = None
@@ -234,9 +269,12 @@ def test_worker_graceful_shutdown_with_multiple_runners(self):
         shutdown_thread.kill()

     def test_worker_graceful_shutdown_with_single_runner(self):
-        cfg.CONF.set_override(
-            name="graceful_shutdown", override=True, group="actionrunner"
+        self.reset_config(
+            exit_still_active_check=10,
+            still_active_check_interval=1,
+            service_registry=True,
         )
+
         action_worker = actions_worker.get_worker()

         temp_file = None
@@ -296,17 +334,13 @@ def test_worker_graceful_shutdown_with_single_runner(self):
         shutdown_thread.kill()

     @mock.patch.object(
-        coordination.NoOpDriver,
+        RedisDriver,
         "get_members",
-        mock.MagicMock(return_value=coordination.NoOpAsyncResult("member-1")),
+        mock.MagicMock(return_value=coordination.NoOpAsyncResult(("member-1",))),
     )
     def test_worker_graceful_shutdown_exit_timeout(self):
-        cfg.CONF.set_override(
-            name="graceful_shutdown", override=True, group="actionrunner"
-        )
-        cfg.CONF.set_override(
-            name="exit_still_active_check", override=5, group="actionrunner"
-        )
+        self.reset_config(exit_still_active_check=5)
+
         action_worker = actions_worker.get_worker()

         temp_file = None
diff --git a/st2actions/tests/unit/test_workflow_engine.py b/st2actions/tests/unit/test_workflow_engine.py
index 68d68a3b2f..ed6440bd57 100644
--- a/st2actions/tests/unit/test_workflow_engine.py
+++ b/st2actions/tests/unit/test_workflow_engine.py
@@ -24,7 +24,9 @@
 from orquesta import statuses as wf_statuses
 from oslo_config import cfg
 from tooz import coordination
+from tooz.drivers.redis import RedisDriver

+import st2tests.config as tests_config
 from st2actions.workflows import workflows
 from st2common.bootstrap import actionsregistrar
 from st2common.bootstrap import runnersregistrar
@@ -91,7 +93,41 @@ def setUpClass(cls):
         for pack in PACKS:
             actions_registrar.register_from_pack(pack)

+    @staticmethod
+    def reset_config(
+        graceful_shutdown=None,  # default is True (st2common.config)
+        exit_still_active_check=None,  # default is 300 (st2common.config)
+        still_active_check_interval=None,  # default is 2 (st2common.config)
+        service_registry=None,  # default is False (st2common.config)
+    ):
+        tests_config.reset()
+        tests_config.parse_args()
+        if graceful_shutdown is not None:
+            cfg.CONF.set_override(
+                name="graceful_shutdown",
+                override=graceful_shutdown,
+                group="actionrunner",
+            )
+        if exit_still_active_check is not None:
+            cfg.CONF.set_override(
+                name="exit_still_active_check",
+                override=exit_still_active_check,
+                group="workflow_engine",
+            )
+        if still_active_check_interval is not None:
+            cfg.CONF.set_override(
+                name="still_active_check_interval",
+                override=still_active_check_interval,
+                group="workflow_engine",
+            )
+        if service_registry is not None:
+            cfg.CONF.set_override(
+                name="service_registry", override=service_registry, group="coordination"
+            )
+
     def test_process(self):
+        self.reset_config()
+
         wf_meta = self.get_wf_fixture_meta_data(TEST_PACK_PATH, "sequential.yaml")
         lv_ac_db = lv_db_models.LiveActionDB(action=wf_meta["name"])
         lv_ac_db, ac_ex_db = action_service.request(lv_ac_db)
@@ -142,8 +178,10 @@ def test_process(self):
         lv_ac_db = lv_db_access.LiveAction.get_by_id(str(lv_ac_db.id))
         self.assertEqual(lv_ac_db.status, action_constants.LIVEACTION_STATUS_SUCCEEDED)

-    @mock.patch.object(coordination_service.NoOpDriver, "get_lock")
+    @mock.patch.object(RedisDriver, "get_lock")
     def test_process_error_handling(self, mock_get_lock):
+        self.reset_config(service_registry=True)
+
         expected_errors = [
             {
                 "message": "Execution failed. See result for details.",
@@ -157,7 +195,6 @@ def test_process_error_handling(self, mock_get_lock):
                 "route": 0,
             },
         ]
-        mock_get_lock.side_effect = coordination_service.NoOpLock(name="noop")
         wf_meta = self.get_wf_fixture_meta_data(TEST_PACK_PATH, "sequential.yaml")
         lv_ac_db = lv_db_models.LiveActionDB(action=wf_meta["name"])
         lv_ac_db, ac_ex_db = action_service.request(lv_ac_db)
@@ -178,8 +215,6 @@ def test_process_error_handling(self, mock_get_lock):
             task_execution=str(t1_ex_db.id)
         )[0]
         mock_get_lock.side_effect = [
-            coordination.ToozConnectionError("foobar"),
-            coordination.ToozConnectionError("foobar"),
             coordination.ToozConnectionError("foobar"),
             coordination.ToozConnectionError("foobar"),
             coordination.ToozConnectionError("foobar"),
@@ -200,7 +235,7 @@ def test_process_error_handling(self, mock_get_lock):
         self.assertEqual(lv_ac_db.status, action_constants.LIVEACTION_STATUS_FAILED)

     @mock.patch.object(
-        coordination_service.NoOpDriver,
+        RedisDriver,
         "get_lock",
     )
     @mock.patch.object(
@@ -209,9 +244,12 @@ def test_process_error_handling(self, mock_get_lock):
         mock.MagicMock(side_effect=Exception("Unexpected error.")),
     )
     def test_process_error_handling_has_error(self, mock_get_lock):
+        self.reset_config()
+
         mock_get_lock.side_effect = coordination_service.NoOpLock(name="noop")
         wf_meta = self.get_wf_fixture_meta_data(TEST_PACK_PATH, "sequential.yaml")
         lv_ac_db = lv_db_models.LiveActionDB(action=wf_meta["name"])
+
         lv_ac_db, ac_ex_db = action_service.request(lv_ac_db)

         # Assert action execution is running.
@@ -262,15 +300,14 @@ def test_process_error_handling_has_error(self, mock_get_lock):
         lv_ac_db = lv_db_access.LiveAction.get_by_id(str(lv_ac_db.id))
         self.assertEqual(lv_ac_db.status, action_constants.LIVEACTION_STATUS_CANCELED)

-    @mock.patch.object(
-        coordination_service.NoOpDriver,
-        "get_members",
-        mock.MagicMock(return_value=coordination_service.NoOpAsyncResult("")),
-    )
     def test_workflow_engine_shutdown(self):
-        cfg.CONF.set_override(
-            name="service_registry", override=True, group="coordination"
+        self.reset_config(
+            graceful_shutdown=True,
+            exit_still_active_check=4,
+            still_active_check_interval=1,
+            service_registry=True,
         )
+
         wf_meta = self.get_wf_fixture_meta_data(TEST_PACK_PATH, "sequential.yaml")
         lv_ac_db = lv_db_models.LiveActionDB(action=wf_meta["name"])
         lv_ac_db, ac_ex_db = action_service.request(lv_ac_db)
@@ -283,11 +320,10 @@ def test_workflow_engine_shutdown(self):
         )[0]
         self.assertEqual(wf_ex_db.status, action_constants.LIVEACTION_STATUS_RUNNING)

         workflow_engine = workflows.get_engine()
-
         eventlet.spawn(workflow_engine.shutdown)

         # Sleep for few seconds to ensure execution transitions to pausing.
-        eventlet.sleep(5)
+        eventlet.sleep(8)
         lv_ac_db = lv_db_access.LiveAction.get_by_id(str(lv_ac_db.id))
         self.assertEqual(lv_ac_db.status, action_constants.LIVEACTION_STATUS_PAUSING)
@@ -325,14 +361,15 @@ def test_workflow_engine_shutdown(self):
         )

     @mock.patch.object(
-        coordination_service.NoOpDriver,
+        RedisDriver,
         "get_members",
-        mock.MagicMock(return_value=coordination_service.NoOpAsyncResult("member-1")),
+        mock.MagicMock(
+            return_value=coordination_service.NoOpAsyncResult(("member-1",))
+        ),
     )
     def test_workflow_engine_shutdown_with_multiple_members(self):
-        cfg.CONF.set_override(
-            name="service_registry", override=True, group="coordination"
-        )
+        self.reset_config(service_registry=True)
+
         wf_meta = self.get_wf_fixture_meta_data(TEST_PACK_PATH, "sequential.yaml")
         lv_ac_db = lv_db_models.LiveActionDB(action=wf_meta["name"])
         lv_ac_db, ac_ex_db = action_service.request(lv_ac_db)
@@ -373,9 +410,8 @@ def test_workflow_engine_shutdown_with_multiple_members(self):
         self.assertEqual(lv_ac_db.status, action_constants.LIVEACTION_STATUS_RUNNING)

     def test_workflow_engine_shutdown_with_service_registry_disabled(self):
-        cfg.CONF.set_override(
-            name="service_registry", override=False, group="coordination"
-        )
+        self.reset_config(service_registry=False)
+
         wf_meta = self.get_wf_fixture_meta_data(TEST_PACK_PATH, "sequential.yaml")
         lv_ac_db = lv_db_models.LiveActionDB(action=wf_meta["name"])
         lv_ac_db, ac_ex_db = action_service.request(lv_ac_db)
@@ -399,17 +435,13 @@ def test_workflow_engine_shutdown_with_service_registry_disabled(self):
         self.assertEqual(lv_ac_db.status, action_constants.LIVEACTION_STATUS_RUNNING)

     @mock.patch.object(
-        coordination_service.NoOpDriver,
+        RedisDriver,
         "get_lock",
         mock.MagicMock(return_value=coordination_service.NoOpLock(name="noop")),
     )
     def test_workflow_engine_shutdown_first_then_start(self):
-        cfg.CONF.set_override(
-            name="service_registry", override=True, group="coordination"
-        )
-        cfg.CONF.set_override(
-            name="exit_still_active_check", override=0, group="workflow_engine"
-        )
+        self.reset_config(service_registry=True, exit_still_active_check=0)
+
         wf_meta = self.get_wf_fixture_meta_data(TEST_PACK_PATH, "sequential.yaml")
         lv_ac_db = lv_db_models.LiveActionDB(action=wf_meta["name"])
         lv_ac_db, ac_ex_db = action_service.request(lv_ac_db)
@@ -456,17 +488,13 @@ def test_workflow_engine_shutdown_first_then_start(self):
         )

     @mock.patch.object(
-        coordination_service.NoOpDriver,
+        RedisDriver,
         "get_lock",
         mock.MagicMock(return_value=coordination_service.NoOpLock(name="noop")),
     )
     def test_workflow_engine_start_first_then_shutdown(self):
-        cfg.CONF.set_override(
-            name="service_registry", override=True, group="coordination"
-        )
-        cfg.CONF.set_override(
-            name="exit_still_active_check", override=0, group="workflow_engine"
-        )
+        self.reset_config(service_registry=True, exit_still_active_check=0)
+
         wf_meta = self.get_wf_fixture_meta_data(TEST_PACK_PATH, "sequential.yaml")
         lv_ac_db = lv_db_models.LiveActionDB(action=wf_meta["name"])
         lv_ac_db, ac_ex_db = action_service.request(lv_ac_db)
@@ -480,15 +508,15 @@ def test_workflow_engine_start_first_then_shutdown(self):
         self.assertEqual(wf_ex_db.status, action_constants.LIVEACTION_STATUS_RUNNING)

         workflow_engine = workflows.get_engine()
+        RedisDriver.get_members = mock.MagicMock(
+            return_value=coordination_service.NoOpAsyncResult(("member-1",))
+        )
+        workflow_engine._delay = 0

         # Initiate start first
         eventlet.spawn(workflow_engine.start, True)
         eventlet.spawn_after(1, workflow_engine.shutdown)

-        coordination_service.NoOpDriver.get_members = mock.MagicMock(
-            return_value=coordination_service.NoOpAsyncResult("member-1")
-        )
-
         lv_ac_db = lv_db_access.LiveAction.get_by_id(str(lv_ac_db.id))

         # Startup routine acquires the lock first and shutdown routine sees a new member present in registry.
diff --git a/st2common/tests/unit/services/test_workflow_service_retries.py b/st2common/tests/unit/services/test_workflow_service_retries.py
index ca2fab6f9f..4de49fcf3f 100644
--- a/st2common/tests/unit/services/test_workflow_service_retries.py
+++ b/st2common/tests/unit/services/test_workflow_service_retries.py
@@ -27,6 +27,7 @@
 from orquesta import statuses as wf_statuses

 from tooz import coordination
+from tooz.drivers.redis import RedisDriver

 import st2tests

@@ -123,7 +124,7 @@ def setUpClass(cls):
         for pack in PACKS:
             actions_registrar.register_from_pack(pack)

-    @mock.patch.object(coord_svc.NoOpDriver, "get_lock")
+    @mock.patch.object(RedisDriver, "get_lock")
     def test_recover_from_coordinator_connection_error(self, mock_get_lock):
         mock_get_lock.side_effect = coord_svc.NoOpLock(name="noop")
         wf_meta = self.get_wf_fixture_meta_data(TEST_PACK_PATH, "sequential.yaml")
@@ -157,7 +158,7 @@ def test_recover_from_coordinator_connection_error(self, mock_get_lock):
         tk1_ex_db = wf_db_access.TaskExecution.get_by_id(tk1_ex_db.id)
         self.assertEqual(tk1_ex_db.status, wf_statuses.SUCCEEDED)

-    @mock.patch.object(coord_svc.NoOpDriver, "get_lock")
+    @mock.patch.object(RedisDriver, "get_lock")
     def test_retries_exhausted_from_coordinator_connection_error(self, mock_get_lock):
         mock_get_lock.side_effect = coord_svc.NoOpLock(name="noop")
         wf_meta = self.get_wf_fixture_meta_data(TEST_PACK_PATH, "sequential.yaml")
@@ -195,7 +196,6 @@ def test_retries_exhausted_from_coordinator_connection_error(self, mock_get_lock
         "update_task_state",
         mock.MagicMock(
             side_effect=[
-                mongoengine.connection.ConnectionFailure(),
                 mongoengine.connection.ConnectionFailure(),
                 None,
             ]
diff --git a/st2common/tests/unit/test_service_setup.py b/st2common/tests/unit/test_service_setup.py
index 0fa413ca5d..4638be2f88 100644
--- a/st2common/tests/unit/test_service_setup.py
+++ b/st2common/tests/unit/test_service_setup.py
@@ -217,6 +217,7 @@ def test_deregister_service_when_service_registry_enabled(self):
         members = coordinator.get_members(service.encode("utf-8"))
         self.assertEqual(len(list(members.get())), 1)
         service_setup.deregister_service(service)
+        members = coordinator.get_members(service.encode("utf-8"))
         self.assertEqual(len(list(members.get())), 0)

     def test_deregister_service_when_service_registry_disables(self):
diff --git a/st2tests/st2tests/config.py b/st2tests/st2tests/config.py
index bef5197bcf..40c82151e9 100644
--- a/st2tests/st2tests/config.py
+++ b/st2tests/st2tests/config.py
@@ -139,12 +139,18 @@ def _override_scheduler_opts():

 def _override_coordinator_opts(noop=False):
     driver = None if noop else "zake://"
+
+    redis_host = os.environ.get("ST2TESTS_REDIS_HOST", False)
+    if redis_host:
+        redis_port = os.environ.get("ST2TESTS_REDIS_PORT", "6379")
+        driver = f"redis://{redis_host}:{redis_port}"
+
     CONF.set_override(name="url", override=driver, group="coordination")
     CONF.set_override(name="lock_timeout", override=1, group="coordination")


 def _override_workflow_engine_opts():
-    cfg.CONF.set_override("retry_stop_max_msec", 500, group="workflow_engine")
+    cfg.CONF.set_override("retry_stop_max_msec", 200, group="workflow_engine")
     cfg.CONF.set_override("retry_wait_fixed_msec", 100, group="workflow_engine")
     cfg.CONF.set_override("retry_max_jitter_msec", 100, group="workflow_engine")
     cfg.CONF.set_override("gc_max_idle_sec", 1, group="workflow_engine")