diff --git a/terraform/services/ecr-cleanup/lambda_src/lambda_function.py b/terraform/services/ecr-cleanup/lambda_src/lambda_function.py index add85d7f..80368918 100644 --- a/terraform/services/ecr-cleanup/lambda_src/lambda_function.py +++ b/terraform/services/ecr-cleanup/lambda_src/lambda_function.py @@ -163,6 +163,12 @@ def get_protected_image_refs(client): for i in range(0, len(task_arns), AWS_BATCH_SIZE): batch = task_arns[i:i + AWS_BATCH_SIZE] resp = client.describe_tasks(cluster=cluster_arn, tasks=batch) + + if resp.get('failures'): + log({'msg': 'describe_tasks error, protected refs may be incomplete', + 'cluster': cluster_arn, + 'failures': resp['failures']}) + for task in resp['tasks']: for container in task.get('containers', []): _, ref = parse_image_ref(container.get('image', '')) @@ -175,7 +181,12 @@ def delete_images(client, repo_name, images): image_ids = [{'imageDigest': img.digest} for img in images] for i in range(0, len(image_ids), AWS_BATCH_SIZE): batch = image_ids[i:i + AWS_BATCH_SIZE] - client.batch_delete_image(repositoryName=repo_name, imageIds=batch) + resp = client.batch_delete_image(repositoryName=repo_name, imageIds=batch) + + if resp.get('failures'): + log({'msg': 'Batch image deletion error', + 'repo': repo_name, + 'failures': resp['failures']}) def log_images_for_deletion(repo, images): """Logs images that would be deleted if the repo were opted in.""" @@ -206,6 +217,11 @@ def lambda_handler(_, __): else: log_images_for_deletion(repo_name, to_delete) log({'msg': f'Cleanup complete for repo: {repo_name}', 'repo': repo_name}) + log({ + 'msg': 'ECR cleanup lambda completed', + 'app': os.environ['APP'], + 'env': os.environ['ENV'], + }) def run(args): """ Prints tags of (or digest of untagged) images that would be deleted. 
""" diff --git a/terraform/services/ecr-cleanup/lambda_src/test_lambda_function.py b/terraform/services/ecr-cleanup/lambda_src/test_lambda_function.py index fe0f0533..c3970894 100644 --- a/terraform/services/ecr-cleanup/lambda_src/test_lambda_function.py +++ b/terraform/services/ecr-cleanup/lambda_src/test_lambda_function.py @@ -221,6 +221,28 @@ def test_delete_images_multiple_batches(): assert len(first_call_ids) == lambda_function.AWS_BATCH_SIZE assert len(second_call_ids) == 1 +def test_delete_images_logs_failure(capfd): + """When ECR batch_delete_image returns failures, should be logged.""" + mock_ecr = MagicMock() + old_image = make_image('sha256:old', ['asdf-tag'], EXPIRED_DATETIME) + image_delete_failure = { + 'imageId': { + 'imageDigest': old_image.digest, + 'imageTag': old_image.tags[0] + }, + 'failureCode': 'ImageReferencedByManifestList', + 'failureReason': 'Requested image could not be deleted because etc etc' + } + mock_ecr.batch_delete_image.return_value = { + 'imageIds': [], + 'failures': [image_delete_failure] + } + + lambda_function.delete_images(mock_ecr, 'some-repo', [old_image]) + final_log_message = json.loads(capfd.readouterr().out.strip().splitlines()[-1]) + assert "failures" in final_log_message + assert "error" in final_log_message.get("msg") + def test_delete_images_empty_list(): """ Makes sure delete_images does not throw error on empty list. """ mock_ecr = MagicMock() @@ -343,6 +365,58 @@ def test_lambda_handler_deletes_old_unprotected_images(mock_boto3_clients): imageIds=[{'imageDigest': 'sha256:old'}] ) + +def test_lambda_handler_logs_completion_message(mock_boto3_clients, capfd): + """ + Ensures successful execution of lambda_handler() will create log statement + indicating completion of ECR-cleanup. This is used for monitoring in Splunk. 
+ """ + mock_ssm, mock_ecs, mock_ecr = mock_boto3_clients + old_image = make_image('sha256:old', ['old-tag'], EXPIRED_DATETIME).data + _setup_handler_mocks( + mock_ssm, mock_ecs, mock_ecr, + cluster_arns=[CLUSTER_ARN], + task_arns=[f'{CLUSTER_ARN}/task1'], + task_images=[f'{ECR_REGISTRY}/some-repo:protected-tag'], + ecr_images=[old_image], + repo_configs={ 'test': {'some-repo': { 'strategies': (('days_older_than', '', 14,),), + 'opt_in': True } } } + ) + with patch.dict(os.environ, {'APP': 'cdap', 'ENV': 'test'}): + lambda_function.lambda_handler({}, None) + final_log_message = json.loads(capfd.readouterr().out.strip().splitlines()[-1]) + + expected_log_message = 'ECR cleanup lambda completed' + assert expected_log_message in final_log_message["msg"] + + +def test_get_protected_image_refs_logs_describe_tasks_failures(capfd): + """When ECS describe_tasks() returns failures, they should be logged.""" + task_failure = { + 'arn': f'{CLUSTER_ARN}/task1', + 'reason': 'MISSING' + } + mock_ecs = _make_ecs_mock( + cluster_arns=[CLUSTER_ARN], + task_arns=[f'{CLUSTER_ARN}/task1'], + container_images=[], + ) + mock_ecs.describe_tasks.return_value = { + 'tasks': [], + 'failures': [task_failure] + } + mock_ecs.describe_tasks.return_value = { + 'tasks': [], + 'failures': [task_failure] + } + + lambda_function.get_protected_image_refs(mock_ecs) + final_log_message = json.loads(capfd.readouterr().out.strip().splitlines()[-1]) + assert "failures" in final_log_message + assert "error" in final_log_message.get("msg") + + + def test_lambda_handler_protects_images_in_running_tasks(mock_boto3_clients): """Image referenced by a running ECS task is never deleted even if old.""" mock_ssm, mock_ecs, mock_ecr = mock_boto3_clients