Skip to content

Commit

Permalink
Do not autodelete failed instances (#1665)
Browse files Browse the repository at this point in the history
  • Loading branch information
r4victor authored Sep 6, 2024
1 parent a0ec509 commit f654022
Show file tree
Hide file tree
Showing 8 changed files with 30 additions and 25 deletions.
10 changes: 10 additions & 0 deletions src/dstack/_internal/cli/utils/fleet.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ def get_fleets_table(fleets: List[Fleet], verbose: bool = False) -> Table:
table.add_column("STATUS")
table.add_column("CREATED")

if verbose:
table.add_column("ERROR")

for fleet in fleets:
for i, instance in enumerate(fleet.instances):
resources = ""
Expand Down Expand Up @@ -55,6 +58,13 @@ def get_fleets_table(fleets: List[Fleet], verbose: bool = False) -> Table:
status,
pretty_date(instance.created),
]

if verbose:
error = ""
if instance.status == InstanceStatus.TERMINATED and instance.termination_reason:
error = f"{instance.termination_reason}"
row.append(error)

table.add_row(*row)

if len(fleet.instances) == 0 and fleet.status != FleetStatus.TERMINATING:
Expand Down
1 change: 1 addition & 0 deletions src/dstack/_internal/core/models/pools.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class Instance(CoreModel):
hostname: Optional[str] = None
status: InstanceStatus
unreachable: bool = False
termination_reason: Optional[str] = None
created: datetime.datetime
region: Optional[str] = None
price: Optional[float] = None
Expand Down
33 changes: 9 additions & 24 deletions src/dstack/_internal/server/background/tasks/process_instances.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ async def _terminate_idle_instance(instance: InstanceModel):


async def _add_remote(instance: InstanceModel) -> None:
logger.info("Adding remote instance %s...", instance.name)
logger.info("Adding ssh instance %s...", instance.name)
if instance.status == InstanceStatus.PENDING:
instance.status = InstanceStatus.PROVISIONING

Expand All @@ -196,11 +196,9 @@ async def _add_remote(instance: InstanceModel) -> None:
) + timedelta(seconds=PROVISIONING_TIMEOUT_SECONDS)
if retry_duration_deadline < get_current_datetime():
instance.status = InstanceStatus.TERMINATED
instance.deleted = True
instance.deleted_at = get_current_datetime()
instance.termination_reason = "The proivisioning timeout expired"
instance.termination_reason = "Proivisioning timeout expired"
logger.warning(
"Failed to start the instance in %s seconds. Terminate instance %s",
"Failed to start instance in %s seconds. Terminating...",
PROVISIONING_TIMEOUT_SECONDS,
instance.name,
extra={
Expand All @@ -221,11 +219,9 @@ async def _add_remote(instance: InstanceModel) -> None:
]
except (ValueError, PasswordRequiredException):
instance.status = InstanceStatus.TERMINATED
instance.deleted = True
instance.deleted_at = get_current_datetime()
instance.termination_reason = "Unsupported private SSH key type"
logger.warning(
"Failed to start instance %s: unsupported private SSH key type",
"Failed to add instance %s: unsupported private SSH key type",
instance.name,
extra={
"instance_name": instance.name,
Expand Down Expand Up @@ -253,7 +249,7 @@ async def _add_remote(instance: InstanceModel) -> None:
)
except ProvisioningError as e:
logger.warning(
"Provisioning the instance '%s' could not be completed because of the error: %s",
"Provisioning instance %s could not be completed because of the error: %s",
instance.name,
e,
)
Expand All @@ -275,13 +271,9 @@ async def _add_remote(instance: InstanceModel) -> None:
)
if instance_network is not None and internal_ip is None:
instance.status = InstanceStatus.TERMINATED
instance.deleted = True
instance.deleted_at = get_current_datetime()
instance.termination_reason = (
"Unable to locate the internal ip-address for the given network"
)
instance.termination_reason = "Failed to locate internal IP address on the given network"
logger.warning(
"Failed to configure internal ip-address on instance %s. Terminate it",
"Failed to add instance %s: failed to locate internal IP address on the given network",
instance.name,
extra={
"instance_name": instance.name,
Expand Down Expand Up @@ -391,8 +383,6 @@ async def _create_instance(instance: InstanceModel) -> None:
or instance.instance_configuration is None
):
instance.status = InstanceStatus.TERMINATED
instance.deleted = True
instance.deleted_at = get_current_datetime()
instance.termination_reason = "Empty profile, requirements or instance_configuration"
instance.last_retry_at = get_current_datetime()
logger.warning(
Expand All @@ -417,8 +407,6 @@ async def _create_instance(instance: InstanceModel) -> None:
)
except ValidationError as e:
instance.status = InstanceStatus.TERMINATED
instance.deleted = True
instance.deleted_at = get_current_datetime()
instance.termination_reason = (
f"Error to parse profile, requirements or instance_configuration: {e}"
)
Expand All @@ -440,11 +428,9 @@ async def _create_instance(instance: InstanceModel) -> None:
retry_duration_deadline = _get_retry_duration_deadline(instance, retry)
if get_current_datetime() > retry_duration_deadline:
instance.status = InstanceStatus.TERMINATED
instance.deleted = True
instance.deleted_at = get_current_datetime()
instance.termination_reason = "Retry duration expired"
logger.warning(
"Retry duration expired. Terminate instance %s",
"Retry duration expired. Terminating instance %s",
instance.name,
extra={
"instance_name": instance.name,
Expand Down Expand Up @@ -523,8 +509,6 @@ async def _create_instance(instance: InstanceModel) -> None:

if not should_retry:
instance.status = InstanceStatus.TERMINATED
instance.deleted = True
instance.deleted_at = get_current_datetime()
instance.termination_reason = "No offers found"
logger.info(
"No offers found. Terminated instance %s",
Expand Down Expand Up @@ -760,6 +744,7 @@ def _need_to_wait_fleet_provisioning(instance: InstanceModel) -> bool:
if (
instance.id == instance.fleet.instances[0].id
or instance.fleet.instances[0].job_provisioning_data is not None
or instance.fleet.instances[0].status == InstanceStatus.TERMINATED
):
return False
fleet = fleet_model_to_fleet(instance.fleet)
Expand Down
5 changes: 4 additions & 1 deletion src/dstack/_internal/server/services/fleets.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,4 +418,7 @@ def _terminate_fleet_instances(fleet_model: FleetModel, instance_nums: Optional[
for instance in fleet_model.instances:
if instance_nums is not None and instance.instance_num not in instance_nums:
continue
instance.status = InstanceStatus.TERMINATING
if instance.status == InstanceStatus.TERMINATED:
instance.deleted = True
else:
instance.status = InstanceStatus.TERMINATING
1 change: 1 addition & 0 deletions src/dstack/_internal/server/services/pools.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
instance_num=instance_model.instance_num,
status=instance_model.status,
unreachable=instance_model.unreachable,
termination_reason=instance_model.termination_reason,
created=instance_model.created_at.replace(tzinfo=timezone.utc),
)

Expand Down
2 changes: 2 additions & 0 deletions src/tests/_internal/server/routers/test_fleets.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ async def test_creates_fleet(self, test_db, session: AsyncSession, client: Async
"hostname": None,
"status": "pending",
"unreachable": False,
"termination_reason": None,
"created": "2023-01-02T03:04:00+00:00",
"pool_name": None,
"backend": None,
Expand Down Expand Up @@ -335,6 +336,7 @@ async def test_creates_ssh_fleet(self, test_db, session: AsyncSession, client: A
"hostname": "1.1.1.1",
"status": "pending",
"unreachable": False,
"termination_reason": None,
"created": "2023-01-02T03:04:00+00:00",
"region": "remote",
"price": 0.0,
Expand Down
2 changes: 2 additions & 0 deletions src/tests/_internal/server/routers/test_pools.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ async def test_show_pool(self, test_db, session: AsyncSession, client: AsyncClie
"hostname": "running_instance.ip",
"status": "idle",
"unreachable": False,
"termination_reason": None,
"created": "2023-01-02T03:04:00+00:00",
"pool_name": None,
"region": "en",
Expand Down Expand Up @@ -489,6 +490,7 @@ async def test_remove_instance(self, test_db, session: AsyncSession, client: Asy
"hostname": "running_instance.ip",
"status": "terminating",
"unreachable": False,
"termination_reason": None,
"created": "2023-01-02T03:04:00+00:00",
"pool_name": None,
"region": "en",
Expand Down
1 change: 1 addition & 0 deletions src/tests/_internal/server/routers/test_runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1021,6 +1021,7 @@ async def test_creates_instance(self, test_db, session: AsyncSession, client: As
"hostname": None,
"status": "pending",
"unreachable": False,
"termination_reason": None,
"created": result["created"],
"pool_name": None,
"region": None,
Expand Down

0 comments on commit f654022

Please sign in to comment.