diff --git a/.circleci/config.yml b/.circleci/config.yml index ec1daccbe..51cf22dc4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -379,18 +379,23 @@ workflows: - test-webapp: requires: - generate-graphql-schema - sci-test-on-demand: - jobs: - - generate-graphql-schema - - test-engine-sci: - requires: - - generate-graphql-schema - filters: - branches: - only: - - master - # Add additional branches to run sci-test against here. Clean up old branches if you notice any. - - feat/lithops-ds-sort-in-ram + +## sci-test on demand is currently failing as it either uses more RAM than CircleCI allows, or takes >15 minutes if +## parallelism is reduced to minimize memory usage. +## Re-test after https://github.com/metaspace2020/metaspace/issues/763 has been addressed +# +# sci-test-on-demand: +# jobs: +# - generate-graphql-schema +# - test-engine-sci: +# requires: +# - generate-graphql-schema +# filters: +# branches: +# only: +# - master +# # Add additional branches to run sci-test against here. Clean up old branches if you notice any. 
+# - feat/lithops-ds-sort-in-ram ## Disabled until we have time to redo these tests diff --git a/.github/workflows/pythonpublish.yml b/.github/workflows/pythonpublish.yml index 648381bd5..48f113152 100644 --- a/.github/workflows/pythonpublish.yml +++ b/.github/workflows/pythonpublish.yml @@ -1,12 +1,7 @@ name: Publish python-client on: - workflow_dispatch: # Allow manual trigger - push: - paths: - - 'metaspace/python-client/metaspace/__init__.py' - branches: - - release + workflow_dispatch: # Manual trigger jobs: deploy: diff --git a/ansible/roles/sm_lithops_daemon/templates/sm-lithops-daemon.supervisor.j2 b/ansible/roles/sm_lithops_daemon/templates/sm-lithops-daemon.supervisor.j2 index 9bc5dfef0..a330406fa 100644 --- a/ansible/roles/sm_lithops_daemon/templates/sm-lithops-daemon.supervisor.j2 +++ b/ansible/roles/sm_lithops_daemon/templates/sm-lithops-daemon.supervisor.j2 @@ -9,3 +9,4 @@ redirect_stderr = true stdout_logfile = {{ sm_home }}/logs/{{ sm_lithops_daemon_app_name }}-%(process_num)s.log numprocs = {{ sm_lithops_daemon_nprocs }} startsecs = 10 +killasgroup = true ; Clean up child processes from multiprocessing \ No newline at end of file diff --git a/metaspace/engine/sm/engine/daemons/lithops.py b/metaspace/engine/sm/engine/daemons/lithops.py index 1c924a0eb..be35ac974 100644 --- a/metaspace/engine/sm/engine/daemons/lithops.py +++ b/metaspace/engine/sm/engine/daemons/lithops.py @@ -5,11 +5,11 @@ from traceback import format_exc from sm.engine.annotation_lithops.executor import LithopsStalledException +from sm.engine.config import SMConfig from sm.engine.daemons.actions import DaemonActionStage, DaemonAction from sm.engine.dataset import DatasetStatus -from sm.engine.errors import ImzMLError, AnnotationError +from sm.engine.errors import AnnotationError from sm.engine.queue import QueueConsumer, QueuePublisher -from sm.engine.config import SMConfig from sm.rest.dataset_manager import DatasetActionPriority @@ -43,29 +43,30 @@ def _on_success(self, msg): 
self.logger.info(' SM lithops daemon: success') self._manager.post_to_slack('dart', f' [v] Annotation succeeded: {json.dumps(msg)}') + # pylint: disable=unused-argument def _on_failure(self, msg, e): - if isinstance(e, LithopsStalledException): - # Requeue the message so it retries, then exit the process and let supervisor restart it - if msg.get('retry_attempt', 0) < 1: - self.logger.info('Lithops stalled. Retrying') - self._lithops_queue_pub.publish( - {**msg, 'retry_attempt': msg.get('retry_attempt', 0) + 1} - ) - else: - self.logger.critical('Lithops stalled. Retrying on Spark') - self._annot_queue_pub.publish(msg) - + exc = format_exc(limit=10) + # Requeue the message so it retries + if msg.get('retry_attempt', 0) < 1: + self.logger.warning(f'Lithops annotation failed, retrying.\n{exc}') + self._lithops_queue_pub.publish( + {**msg, 'retry_attempt': msg.get('retry_attempt', 0) + 1} + ) self._manager.post_to_slack( - 'bomb', f" [x] Lithops stall: {json.dumps(msg)}\n```{format_exc(limit=10)}```" + 'bomb', f" [x] Annotation failed, retrying: {json.dumps(msg)}\n```{exc}```", ) - os.kill(os.getpid(), signal.SIGINT) - else: - self._manager.ds_failure_handler(msg, e) + self.logger.critical(f'Lithops annotation failed. 
Falling back to Spark\n{exc}') + self._annot_queue_pub.publish(msg) + + self._manager.post_to_slack( + 'bomb', + f" [x] Annotation failed, retrying on Spark: {json.dumps(msg)}\n```{exc}```", + ) - if 'email' in msg: - traceback = e.__cause__.traceback if isinstance(e.__cause__, ImzMLError) else None - self._manager.send_failed_email(msg, traceback) + # Exit the process and let supervisor restart it, in case Lithops was left in + # an unrecoverable state + os.kill(os.getpid(), signal.SIGINT) def _callback(self, msg): try: diff --git a/metaspace/engine/sm/engine/daemons/update.py b/metaspace/engine/sm/engine/daemons/update.py index e3e8b9d5b..68010f237 100644 --- a/metaspace/engine/sm/engine/daemons/update.py +++ b/metaspace/engine/sm/engine/daemons/update.py @@ -9,8 +9,7 @@ class SMUpdateDaemon: - """ Reads messages from the update queue and does indexing/update/delete - """ + """Reads messages from the update queue and does indexing/update/delete""" logger = logging.getLogger('update-daemon') @@ -37,7 +36,7 @@ def _on_success(self, msg): if msg['action'] in [DaemonAction.UPDATE, DaemonAction.INDEX]: msg['web_app_link'] = self._manager.create_web_app_link(msg) - if msg['action'] != DaemonAction.UPDATE: + if msg['action'] == DaemonAction.DELETE: self._manager.post_to_slack( 'dart', f' [v] Succeeded to {msg["action"]}: {json.dumps(msg)}' ) @@ -54,9 +53,6 @@ def _on_failure(self, msg, e): def _callback(self, msg): try: self.logger.info(f' SM update daemon received a message: {msg}') - self._manager.post_to_slack( - 'new', f' [v] New {msg["action"]} message: {json.dumps(msg)}' - ) ds = self._manager.load_ds(msg['ds_id']) self._manager.notify_update(ds.id, msg['action'], DaemonActionStage.STARTED) diff --git a/metaspace/graphql/src/modules/auth/operation.ts b/metaspace/graphql/src/modules/auth/operation.ts index 7876c0f46..8afedbc02 100644 --- a/metaspace/graphql/src/modules/auth/operation.ts +++ b/metaspace/graphql/src/modules/auth/operation.ts @@ -76,7 +76,7 @@ export 
const findUserByApiKey = async(apiKey: string, groups = false) => { return (await query.getOne()) || null } -export const createExpiry = (minutes = 10): Moment => { +export const createExpiry = (minutes = 60): Moment => { return utc().add(minutes, 'minutes') }