diff --git a/readthedocs/builds/models.py b/readthedocs/builds/models.py index 195c7a99409..493d5f20a81 100644 --- a/readthedocs/builds/models.py +++ b/readthedocs/builds/models.py @@ -2,9 +2,11 @@ import datetime import fnmatch +import json import os.path import re from functools import partial +from io import BytesIO import regex import structlog @@ -29,6 +31,7 @@ from readthedocs.builds.constants import EXTERNAL_VERSION_STATES from readthedocs.builds.constants import INTERNAL from readthedocs.builds.constants import LATEST +from readthedocs.builds.constants import MAX_BUILD_COMMAND_SIZE from readthedocs.builds.constants import PREDEFINED_MATCH_ARGS from readthedocs.builds.constants import PREDEFINED_MATCH_ARGS_VALUES from readthedocs.builds.constants import STABLE @@ -73,6 +76,7 @@ from readthedocs.projects.ordering import ProjectItemPositionManager from readthedocs.projects.validators import validate_build_config_file from readthedocs.projects.version_handling import determine_stable_version +from readthedocs.storage import build_commands_storage log = structlog.get_logger(__name__) @@ -386,8 +390,15 @@ def get_absolute_url(self): ) def delete(self, *args, **kwargs): + from readthedocs.builds.tasks import remove_build_commands_storage_paths from readthedocs.projects.tasks.utils import clean_project_resources + # Remove build artifacts from storage for cold storage builds. + paths_to_delete = [] + for build in self.builds.filter(cold_storage=True).iterator(): + paths_to_delete.append(build.storage_path) + remove_build_commands_storage_paths.delay(paths_to_delete) + log.info("Removing files for version.", version_slug=self.slug) clean_project_resources(self.project, self) super().delete(*args, **kwargs) @@ -869,6 +880,68 @@ def save(self, *args, **kwargs): # noqa self._readthedocs_yaml_config = None self._readthedocs_yaml_config_changed = False + def delete(self, *args, **kwargs): + # Delete from storage if the build steps are stored outside the database. + if self.cold_storage: + try: + build_commands_storage.delete(self.storage_path) + except Exception: + log.info( + "Failed to delete build commands from storage.", + build_id=self.id, + storage_path=self.storage_path, + exc_info=True, + ) + return super().delete(*args, **kwargs) + + def move_to_cold_storage(self): + """ + Move build steps to cold storage if they are not already there. + + Build steps are removed from the database and stored in a file in the storage backend. + This is useful for old builds that are not accessed frequently, to save space in the database. + """ + from readthedocs.api.v2.serializers import BuildCommandSerializer + + if self.cold_storage: + return + + commands = BuildCommandSerializer(self.commands, many=True).data + if commands: + for cmd in commands: + if len(cmd["output"]) > MAX_BUILD_COMMAND_SIZE: + cmd["output"] = cmd["output"][-MAX_BUILD_COMMAND_SIZE:] + cmd["output"] = ( + "\n\n" + "... (truncated) ..." + "\n\n" + "Command output too long. Truncated to last 1MB." + "\n\n" + cmd["output"] + ) + log.debug("Truncating build command for build.", build_id=self.id) + output = BytesIO(json.dumps(commands).encode("utf8")) + try: + build_commands_storage.save(name=self.storage_path, content=output) + self.commands.all().delete() + except IOError: + log.exception("Cold Storage save failure") + return + + self.cold_storage = True + self.save() + + @property + def storage_path(self): + """ + Storage path where the build commands will be stored. + + The path is in the format: /.json + + Example: 2024-01-01/1111.json + """ + date = self.date.date() + return f"{date}/{self.id}.json" + def get_absolute_url(self): return reverse("builds_detail", args=[self.project.slug, self.pk]) diff --git a/readthedocs/builds/storage.py b/readthedocs/builds/storage.py index 7de5d51575b..ab6dd95a393 100644 --- a/readthedocs/builds/storage.py +++ b/readthedocs/builds/storage.py @@ -79,6 +79,16 @@ def delete_directory(self, path): if filename: self.delete(self.join(path, filename)) + def delete_paths(self, paths): + """ + Delete multiple paths from storage. + + This is a convenience method to delete multiple paths at once, which can be more efficient + for some storage backends that support batch deletion (eg. S3). + """ + for path in paths: + self.delete(path) + def copy_directory(self, source, destination): """ Copy a directory recursively to storage. diff --git a/readthedocs/builds/tasks.py b/readthedocs/builds/tasks.py index c9b159f641d..d4eab3e7f2c 100644 --- a/readthedocs/builds/tasks.py +++ b/readthedocs/builds/tasks.py @@ -1,9 +1,9 @@ import json -from io import BytesIO import requests import structlog from django.conf import settings +from django.core.cache import cache from django.urls import reverse from django.utils import timezone from django.utils.translation import gettext_lazy as _ @@ -11,19 +11,18 @@ from oauthlib.oauth2.rfc6749.errors import TokenExpiredError from readthedocs import __version__ -from readthedocs.api.v2.serializers import BuildCommandSerializer from readthedocs.api.v2.utils import delete_versions_from_db from readthedocs.api.v2.utils import get_deleted_active_versions from readthedocs.api.v2.utils import run_version_automation_rules from readthedocs.api.v2.utils import sync_versions_to_db from readthedocs.builds.constants import BRANCH +from readthedocs.builds.constants import BUILD_FINAL_STATES from readthedocs.builds.constants import BUILD_STATUS_FAILURE from readthedocs.builds.constants import BUILD_STATUS_PENDING from readthedocs.builds.constants import BUILD_STATUS_SUCCESS from readthedocs.builds.constants import EXTERNAL from readthedocs.builds.constants import EXTERNAL_VERSION_STATE_CLOSED from readthedocs.builds.constants import LOCK_EXPIRE -from readthedocs.builds.constants import MAX_BUILD_COMMAND_SIZE from readthedocs.builds.constants import TAG from readthedocs.builds.models import Build from readthedocs.builds.models import BuildConfig @@ -32,6 +31,7 @@ from readthedocs.builds.utils import memcache_lock from readthedocs.core.utils import send_email from readthedocs.core.utils import trigger_build +from readthedocs.core.utils.db import delete_in_batches from readthedocs.integrations.models import HttpExchange from readthedocs.notifications.models import Notification from readthedocs.oauth.notifications import MESSAGE_OAUTH_BUILD_STATUS_FAILURE @@ -45,7 +45,7 @@ @app.task(queue="web", bind=True) -def archive_builds_task(self, days=14, limit=200, delete=False): +def archive_builds_task(self, days=14, limit=200): """ Task to archive old builds to cold storage. @@ -71,33 +71,8 @@ def archive_builds_task(self, days=14, limit=200, delete=False): .prefetch_related("commands") .only("date", "cold_storage")[:limit] ) - for build in queryset: - commands = BuildCommandSerializer(build.commands, many=True).data - if commands: - for cmd in commands: - if len(cmd["output"]) > MAX_BUILD_COMMAND_SIZE: - cmd["output"] = cmd["output"][-MAX_BUILD_COMMAND_SIZE:] - cmd["output"] = ( - "\n\n" - "... (truncated) ..." - "\n\n" - "Command output too long. Truncated to last 1MB." - "\n\n" + cmd["output"] - ) # noqa - log.debug("Truncating build command for build.", build_id=build.id) - output = BytesIO(json.dumps(commands).encode("utf8")) - filename = "{date}/{id}.json".format(date=str(build.date.date()), id=build.id) - try: - build_commands_storage.save(name=filename, content=output) - if delete: - build.commands.all().delete() - except IOError: - log.exception("Cold Storage save failure") - continue - - build.cold_storage = True - build.save() + build.move_to_cold_storage() @app.task(queue="web") @@ -606,3 +581,85 @@ def remove_orphan_build_config(): count = orphan_buildconfigs.count() orphan_buildconfigs.delete() log.info("Removed orphan BuildConfig objects.", count=count) + + +@app.task(queue="web") +def delete_old_build_objects( + days=360 * 3, keep_recent=250, limit=20_000, max_projects=5_000, start=None +): + """ + Delete old Build objects that are not needed anymore. + + We keep the most recent `keep_recent` builds per version, + and delete builds that are older than `days` days. + + :param limit: Maximum number of builds to delete in one execution. + Keep our DB from being overwhelmed by deleting too many builds at once. + :param max_projects: Maximum number of projects to process in one execution. + Avoids iterating over all projects in one execution. + :param start: Starting index for processing projects. + Normally, this value comes from the cache, and is updated after each execution to continue from where it left off. + But it can also be set to a specific value for manual execution. + """ + cache_key = "rtd-task:delete_old_build_objects_start" + max_count = Project.objects.count() + use_cache = start is None + if use_cache: + start = cache.get(cache_key, 0) + if start >= max_count: + start = 0 + end = start + max_projects + if use_cache: + cache.set(cache_key, end) + + cutoff_date = timezone.now() - timezone.timedelta(days=days) + projects = Project.objects.all().order_by("pub_date").only("pk")[start:end] + for project in projects: + # Delete builds associated with versions, keeping + # the most recent `keep_recent` builds per version. + for version in project.versions.exclude(builds=None).only("pk").iterator(): + builds_to_delete = version.builds.filter( + state__in=BUILD_FINAL_STATES, + date__lt=cutoff_date, + ).order_by("-date") + limit -= _delete_builds(builds_to_delete, start=keep_recent, end=keep_recent + limit) + if limit <= 0: + return + + # Delete builds that are not associated with any version, + # keeping the most recent `keep_recent` builds per project. + builds_to_delete = project.builds.filter( + version=None, state__in=BUILD_FINAL_STATES, date__lt=cutoff_date + ).order_by("-date") + limit -= _delete_builds(builds_to_delete, start=keep_recent, end=keep_recent + limit) + if limit <= 0: + return + + +def _delete_builds(builds, start: int, end: int) -> int: + """ + Delete builds from the queryset, starting from `start` index and ending at `end` index. + + This also deletes the storage paths associated with the builds if they are in cold storage. + + :returns: The number of builds deleted. + """ + # NOTE: we can't use a filter over an sliced queryset. + paths_to_delete = [] + for build in builds[start:end]: + if build.cold_storage: + paths_to_delete.append(build.storage_path) + + _, deleted = delete_in_batches(builds, start=start, end=end) + remove_build_commands_storage_paths(paths_to_delete) + return deleted.get("builds.Build", 0) + + +@app.task(queue="web") +def remove_build_commands_storage_paths(paths): + """Remove the build commands from storage for the given paths.""" + log.info("Removing paths from build commands storage.", paths=paths) + try: + build_commands_storage.delete_paths(paths) + except Exception: + log.info("Failed to delete build commands from storage.", exc_info=True) diff --git a/readthedocs/builds/tests/test_tasks.py b/readthedocs/builds/tests/test_tasks.py index 385f79ecd4f..0ff73911d94 100644 --- a/readthedocs/builds/tests/test_tasks.py +++ b/readthedocs/builds/tests/test_tasks.py @@ -11,6 +11,7 @@ from readthedocs.builds.constants import ( BRANCH, BUILD_STATE_FINISHED, + BUILD_STATE_TRIGGERED, EXTERNAL, EXTERNAL_VERSION_STATE_CLOSED, EXTERNAL_VERSION_STATE_OPEN, @@ -22,6 +23,7 @@ archive_builds_task, check_and_disable_project_for_consecutive_failed_builds, delete_closed_external_versions, + delete_old_build_objects, post_build_overview, remove_orphan_build_config, ) @@ -100,7 +102,7 @@ def test_delete_closed_external_versions(self): self.assertFalse(Version.objects.filter(slug="external-inactive-old").exists()) @override_settings(RTD_SAVE_BUILD_COMMANDS_TO_STORAGE=True) - @mock.patch("readthedocs.builds.tasks.build_commands_storage") + @mock.patch("readthedocs.builds.models.build_commands_storage") def test_archive_builds(self, build_commands_storage): project = get(Project) version = get(Version, project=project) @@ -124,7 +126,7 @@ def test_archive_builds(self, build_commands_storage): self.assertEqual(Build.objects.count(), 10) self.assertEqual(BuildCommandResult.objects.count(), 100) - archive_builds_task.delay(days=5, delete=True) + archive_builds_task.delay(days=5) self.assertEqual(len(build_commands_storage.save.mock_calls), 5) self.assertEqual(Build.objects.count(), 10) @@ -268,6 +270,185 @@ def test_remove_orphan_build_config_no_orphans(self): assert BuildConfig.objects.filter(pk=config_2.pk).exists() +class TestDeleteOldBuildObjects(TestCase): + def _create_old_build(self, project, version, days_old, state=BUILD_STATE_FINISHED, cold_storage=False): + """Create a build with a specific age.""" + build = get( + Build, + project=project, + version=version, + state=state, + cold_storage=cold_storage, + ) + # Override the auto-set date with a specific old date. + Build.objects.filter(pk=build.pk).update( + date=timezone.now() - timezone.timedelta(days=days_old) + ) + return build + + def test_old_builds_beyond_keep_recent_are_deleted(self): + """Builds older than `days` and beyond `keep_recent` per version are deleted.""" + project = get(Project) + version = project.versions.get(slug=LATEST) + + # Create 5 old builds with different ages so ordering is deterministic. + old_builds = [self._create_old_build(project, version, days_old=400 + i) for i in range(5)] + + assert Build.objects.filter(version=version).count() == 5 + + # Keep 2 recent builds per version, so 3 should be deleted. + delete_old_build_objects(days=365, keep_recent=2, start=0) + + assert Build.objects.filter(version=version).count() == 2 + # The 2 most-recently-dated builds (those with the smallest days_old) should be kept. + remaining_pks = set( + Build.objects.filter(version=version).values_list("pk", flat=True) + ) + assert remaining_pks == {old_builds[0].pk, old_builds[1].pk} + + def test_recent_builds_are_not_deleted(self): + """Builds newer than `days` threshold are never deleted.""" + project = get(Project) + version = project.versions.get(slug=LATEST) + + # Create builds that are recent (within 'days' cutoff). + for _ in range(5): + get(Build, project=project, version=version, state=BUILD_STATE_FINISHED) + + assert Build.objects.filter(version=version).count() == 5 + + delete_old_build_objects(days=365, keep_recent=2, start=0) + + # None should be deleted since they are all recent. + assert Build.objects.filter(version=version).count() == 5 + + def test_builds_within_keep_recent_are_not_deleted(self): + """The most recent `keep_recent` builds per version are never deleted, even if old.""" + project = get(Project) + version = project.versions.get(slug=LATEST) + + # Create 3 old builds. + for _ in range(3): + self._create_old_build(project, version, days_old=400) + + assert Build.objects.filter(version=version).count() == 3 + + # keep_recent=5 means all 3 should be preserved. + delete_old_build_objects(days=365, keep_recent=5, start=0) + + assert Build.objects.filter(version=version).count() == 3 + + def test_non_final_state_builds_not_deleted(self): + """Builds in non-final states (e.g. triggered) are never deleted.""" + project = get(Project) + version = project.versions.get(slug=LATEST) + + # Create old builds in a non-final state. + for _ in range(5): + self._create_old_build(project, version, days_old=400, state=BUILD_STATE_TRIGGERED) + + assert Build.objects.filter(version=version).count() == 5 + + delete_old_build_objects(days=365, keep_recent=0, start=0) + + # Non-final builds should not be deleted. + assert Build.objects.filter(version=version).count() == 5 + + def test_versionless_builds_deleted(self): + """Old builds without a version are also deleted, beyond `keep_recent` per project.""" + project = get(Project) + + # Create 5 old versionless builds with different ages so ordering is deterministic. + old_builds = [ + self._create_old_build(project, version=None, days_old=400 + i) for i in range(5) + ] + + assert Build.objects.filter(project=project, version=None).count() == 5 + + delete_old_build_objects(days=365, keep_recent=2, start=0) + + # 3 should be deleted (keeping only 2 most recent). + assert Build.objects.filter(project=project, version=None).count() == 2 + remaining_pks = set( + Build.objects.filter(project=project, version=None).values_list("pk", flat=True) + ) + assert remaining_pks == {old_builds[0].pk, old_builds[1].pk} + + def test_limit_stops_deletion(self): + """Deletion stops once `limit` builds have been deleted.""" + project = get(Project) + version = project.versions.get(slug=LATEST) + + for _ in range(10): + self._create_old_build(project, version, days_old=400) + + assert Build.objects.filter(version=version).count() == 10 + + # With limit=3 and keep_recent=0, only 3 builds should be deleted. + delete_old_build_objects(days=365, keep_recent=0, limit=3, start=0) + + assert Build.objects.filter(version=version).count() == 7 + + def test_max_projects_limits_projects_processed(self): + """Only `max_projects` projects are processed per execution.""" + project1 = get(Project) + project2 = get(Project) + version1 = project1.versions.get(slug=LATEST) + version2 = project2.versions.get(slug=LATEST) + + for _ in range(5): + self._create_old_build(project1, version1, days_old=400) + for _ in range(5): + self._create_old_build(project2, version2, days_old=400) + + # Only process 1 project at a time (max_projects=1). + delete_old_build_objects(days=365, keep_recent=0, max_projects=1, start=0) + + total_remaining = ( + Build.objects.filter(version=version1).count() + + Build.objects.filter(version=version2).count() + ) + # Only one project's builds were processed. + assert total_remaining == 5 + + def test_keeps_builds_per_version_independently(self): + """keep_recent applies independently to each version.""" + project = get(Project) + version1 = project.versions.get(slug=LATEST) + version2 = get(Version, project=project, slug="stable") + + for _ in range(5): + self._create_old_build(project, version1, days_old=400) + for _ in range(5): + self._create_old_build(project, version2, days_old=400) + + delete_old_build_objects(days=365, keep_recent=2, start=0) + + # Each version should retain 2 builds. + assert Build.objects.filter(version=version1).count() == 2 + assert Build.objects.filter(version=version2).count() == 2 + + @mock.patch("readthedocs.builds.tasks.build_commands_storage") + def test_cold_storage_paths_are_deleted(self, build_commands_storage): + """Cold storage paths of deleted builds are removed.""" + project = get(Project) + version = project.versions.get(slug=LATEST) + + # Create old builds in cold storage. + for _ in range(3): + self._create_old_build(project, version, days_old=400, cold_storage=True) + # Also create 2 recent builds in cold storage (should NOT be deleted). + for _ in range(2): + get(Build, project=project, version=version, state=BUILD_STATE_FINISHED, cold_storage=True) + + delete_old_build_objects(days=365, keep_recent=0, start=0) + + # All 3 old builds should have been deleted. + assert Build.objects.filter(version=version).count() == 2 + # Storage delete should have been called for each deleted build's path. + build_commands_storage.delete_paths.assert_called() + + @override_settings( PRODUCTION_DOMAIN="readthedocs.org", PUBLIC_DOMAIN="readthedocs.io", diff --git a/readthedocs/core/tests/test_db_utils.py b/readthedocs/core/tests/test_db_utils.py index 36ce3414177..2bb85f63fc5 100644 --- a/readthedocs/core/tests/test_db_utils.py +++ b/readthedocs/core/tests/test_db_utils.py @@ -142,7 +142,7 @@ def test_delete_with_limit_smaller_than_total(self): # Delete only 10 projects with a limit queryset = Project.objects.filter(slug__startswith="limit-project-") - total_deleted, deleted_counter = delete_in_batches(queryset, batch_size=3, limit=10) + total_deleted, deleted_counter = delete_in_batches(queryset, batch_size=3, end=10) # Should delete exactly 10 projects and their related objects assert deleted_counter["projects.Project"] == 10 @@ -157,7 +157,7 @@ def test_delete_with_limit_larger_than_total(self): # Set limit larger than actual count queryset = Project.objects.filter(slug__startswith="over-limit-") - total_deleted, deleted_counter = delete_in_batches(queryset, batch_size=2, limit=100) + total_deleted, deleted_counter = delete_in_batches(queryset, batch_size=2, end=100) # Should delete all 5 projects assert deleted_counter["projects.Project"] == 5 @@ -171,21 +171,21 @@ def test_delete_with_limit_equal_to_batch_size(self): # Set limit equal to batch_size queryset = Project.objects.filter(slug__startswith="equal-limit-") - total_deleted, deleted_counter = delete_in_batches(queryset, batch_size=5, limit=5) + total_deleted, deleted_counter = delete_in_batches(queryset, batch_size=5, end=5) # Should delete exactly 5 projects assert deleted_counter["projects.Project"] == 5 assert Project.objects.filter(slug__startswith="equal-limit-").count() == 5 def test_delete_with_limit_one(self): - """Test deleting with limit=1 (edge case).""" + """Test deleting with end=1 (edge case).""" # Create 5 projects for i in range(5): get(Project, slug=f"one-limit-{i}") # Set limit to 1 queryset = Project.objects.filter(slug__startswith="one-limit-") - total_deleted, deleted_counter = delete_in_batches(queryset, batch_size=2, limit=1) + total_deleted, deleted_counter = delete_in_batches(queryset, batch_size=2, end=1) # Should delete exactly 1 project and its related objects assert deleted_counter["projects.Project"] == 1 @@ -278,7 +278,7 @@ def test_raw_delete_with_limit_smaller_than_total(self): # Delete only 10 versions with a limit queryset = Version.objects.filter(slug__startswith="raw-limit-") - raw_delete_in_batches(queryset, batch_size=3, limit=10) + raw_delete_in_batches(queryset, batch_size=3, end=10) # Should have 10 versions remaining assert Version.objects.filter(slug__startswith="raw-limit-").count() == 10 @@ -292,7 +292,7 @@ def test_raw_delete_with_limit_larger_than_total(self): # Set limit larger than actual count queryset = Version.objects.filter(slug__startswith="raw-over-") - raw_delete_in_batches(queryset, batch_size=2, limit=100) + raw_delete_in_batches(queryset, batch_size=2, end=100) # Should delete all 5 versions assert Version.objects.filter(slug__startswith="raw-over-").count() == 0 @@ -306,13 +306,13 @@ def test_raw_delete_with_limit_equal_to_batch_size(self): # Set limit equal to batch_size queryset = Version.objects.filter(slug__startswith="raw-eq-lim-") - raw_delete_in_batches(queryset, batch_size=5, limit=5) + raw_delete_in_batches(queryset, batch_size=5, end=5) # Should have 5 versions remaining assert Version.objects.filter(slug__startswith="raw-eq-lim-").count() == 5 def test_raw_delete_with_limit_one(self): - """Test raw deleting with limit=1 (edge case).""" + """Test raw deleting with end=1 (edge case).""" # Create a project with 5 versions project = get(Project, slug="raw-one-limit") for i in range(5): @@ -320,7 +320,7 @@ def test_raw_delete_with_limit_one(self): # Set limit to 1 queryset = Version.objects.filter(slug__startswith="raw-one-") - raw_delete_in_batches(queryset, batch_size=2, limit=1) + raw_delete_in_batches(queryset, batch_size=2, end=1) # Should have 4 versions remaining assert Version.objects.filter(slug__startswith="raw-one-").count() == 4 diff --git a/readthedocs/core/utils/db.py b/readthedocs/core/utils/db.py index 640539d1fc2..ee56e429563 100644 --- a/readthedocs/core/utils/db.py +++ b/readthedocs/core/utils/db.py @@ -2,7 +2,9 @@ from itertools import batched -def delete_in_batches(queryset, batch_size=50, limit: int | None = None) -> tuple[int, dict]: +def delete_in_batches( + queryset, batch_size=50, start: int | None = None, end: int | None = None +) -> tuple[int, dict]: """ Delete a queryset in batches to avoid long transactions or big queries. @@ -20,12 +22,15 @@ def delete_in_batches(queryset, batch_size=50, limit: int | None = None) -> tupl :param queryset: Django queryset to delete :param batch_size: Number of records to delete per batch - :param limit: Maximum number of records to delete from the queryset + :param start: Starting index of the records to delete from the queryset + :param end: Ending index of the records to delete from the queryset """ - if limit is not None and limit <= 0: + if end is not None and start is None: + start = 0 + if end is not None and end <= 0: return 0, {} - if not limit: + if not end: # Don't use batch deletion if the number of records # is smaller or equal to the batch size. # Only do this check if no limit is set, @@ -42,8 +47,8 @@ def delete_in_batches(queryset, batch_size=50, limit: int | None = None) -> tupl # We can't use a limit or offset with .delete, # so we first extract the IDs and perform the deletion in another query. all_pks = queryset.values_list("pk", flat=True) - if limit: - all_pks = all_pks[:limit] + if end: + all_pks = all_pks[start:end] for batch in batched(all_pks, batch_size): total_deleted_batch, deleted_counter_batch = model.objects.filter(pk__in=batch).delete() total_deleted += total_deleted_batch @@ -51,7 +56,9 @@ def delete_in_batches(queryset, batch_size=50, limit: int | None = None) -> tupl return total_deleted, dict(deleted_counter) -def raw_delete_in_batches(queryset, batch_size=50, limit: int | None = None) -> int: +def raw_delete_in_batches( + queryset, batch_size=50, start: int | None = None, end: int | None = None +) -> int: """ Raw delete a queryset in batches to avoid long transactions or big queries. @@ -69,10 +76,12 @@ def raw_delete_in_batches(queryset, batch_size=50, limit: int | None = None) -> :return: Number of deleted records """ - if limit is not None and limit <= 0: + if end is not None and start is None: + start = 0 + if end is not None and end <= 0: return 0 - if not limit: + if not end: # Don't use batch deletion if the number of records # is smaller or equal to the batch size. # Only do this check if no limit is set, @@ -89,8 +98,8 @@ def raw_delete_in_batches(queryset, batch_size=50, limit: int | None = None) -> # We can't use a limit or offset with .raw_delete, # so we first extract the IDs and perform the deletion in another query. all_pks = queryset.values_list("pk", flat=True) - if limit: - all_pks = all_pks[:limit] + if end: + all_pks = all_pks[start:end] for batch in batched(all_pks, batch_size): qs = model.objects.filter(pk__in=batch) qs._raw_delete(qs.db) diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index e54ae6571b4..37f0ffbd999 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -716,6 +716,7 @@ def save(self, *args, **kwargs): self.update_latest_version() def delete(self, *args, **kwargs): + from readthedocs.builds.tasks import remove_build_commands_storage_paths from readthedocs.projects.tasks.utils import clean_project_resources # NOTE: We use _raw_delete to avoid Django fetching all objects @@ -727,6 +728,12 @@ def delete(self, *args, **kwargs): qs = self.search_queries.all() qs._raw_delete(qs.db) + # Remove build artifacts from storage for cold storage builds. + paths_to_delete = [] + for build in self.builds.filter(cold_storage=True).iterator(): + paths_to_delete.append(build.storage_path) + remove_build_commands_storage_paths.delay(paths_to_delete) + # Remove extra resources clean_project_resources(self) diff --git a/readthedocs/projects/tests/test_models.py b/readthedocs/projects/tests/test_models.py index c7bc6bd4c8f..5813743e5b0 100644 --- a/readthedocs/projects/tests/test_models.py +++ b/readthedocs/projects/tests/test_models.py @@ -236,5 +236,5 @@ def test_number_of_queries_on_project_deletion(self): get(SearchQuery, project=self.project, version=version) get(Build, project=self.project, version=version) - with self.assertNumQueries(50): + with self.assertNumQueries(51): self.project.delete() diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py index cecd430bb94..ef328fa0591 100644 --- a/readthedocs/settings/base.py +++ b/readthedocs/settings/base.py @@ -724,7 +724,6 @@ def BUILD_MEMORY_LIMIT(self): "kwargs": { "days": 1, "limit": 500, - "delete": True, }, }, "every-30m-delete-inactive-external-versions": { @@ -763,6 +762,19 @@ def BUILD_MEMORY_LIMIT(self): "options": {"queue": "web"}, "kwargs": {"limit": 10_000}, }, + "every-hour-delete-old-build-objects": { + "task": "readthedocs.builds.tasks.delete_old_build_objects", + # NOTE: we are running this task with a limit for now + # to don't overload the DB with many deletion queries, + # since we have lots of objects to delete + # TODO: go back to do delete without a limit after we delete the backlog of objects, + # or keep less build objects (keep_recent=100, days=360). + # It should take around 12 days to delete all the old objects on community, + # and 1 day on commercial. + "schedule": crontab(minute=0, hour="*"), + "options": {"queue": "web"}, + "kwargs": {"days": 360 * 3, "keep_recent": 250, "limit": 20_000, "max_projects": 5_000}, + }, } # Sentry diff --git a/readthedocs/storage/s3_storage.py b/readthedocs/storage/s3_storage.py index d5f98acae83..3c81a7ce0e2 100644 --- a/readthedocs/storage/s3_storage.py +++ b/readthedocs/storage/s3_storage.py @@ -10,6 +10,7 @@ # Disable abstract method because we are not overriding all the methods # pylint: disable=abstract-method from functools import cached_property +from itertools import batched from django.conf import settings from django.core.exceptions import ImproperlyConfigured @@ -23,7 +24,20 @@ from .mixins import S3PrivateBucketMixin -class S3BuildMediaStorage(OverrideHostnameMixin, BuildMediaStorageMixin, S3Boto3Storage): +class RTDS3Boto3Storage(S3Boto3Storage): + def delete_paths(self, paths): + """ + Delete multiple paths from storage in batches. + + S3 has a limit of 1000 objects per delete request, so we batch the deletions accordingly. + See https://docs.aws.amazon.com/boto3/latest/reference/services/s3/bucket/delete_objects.html#S3.Bucket.delete_objects. + """ + for batch in batched(paths, 1000): + objects = [{"Key": path} for path in batch] + self.bucket.delete_objects(Delete={"Objects": objects, "Quiet": True}) + + +class S3BuildMediaStorage(OverrideHostnameMixin, BuildMediaStorageMixin, RTDS3Boto3Storage): """An AWS S3 Storage backend for build artifacts.""" bucket_name = getattr(settings, "S3_MEDIA_STORAGE_BUCKET", None) @@ -53,7 +67,7 @@ def _rclone(self): ) -class S3BuildCommandsStorage(S3PrivateBucketMixin, S3Boto3Storage): +class S3BuildCommandsStorage(S3PrivateBucketMixin, RTDS3Boto3Storage): """An AWS S3 Storage backend for build commands.""" bucket_name = getattr(settings, "S3_BUILD_COMMANDS_STORAGE_BUCKET", None) @@ -85,7 +99,7 @@ def __init__(self, *args, **kwargs): # pylint: disable=too-many-ancestors class S3StaticStorage( - S3StaticStorageMixin, OverrideHostnameMixin, S3ManifestStaticStorage, S3Boto3Storage + S3StaticStorageMixin, OverrideHostnameMixin, S3ManifestStaticStorage, RTDS3Boto3Storage ): """ An AWS S3 Storage backend for static media. @@ -94,7 +108,7 @@ class S3StaticStorage( """ -class NoManifestS3StaticStorage(S3StaticStorageMixin, OverrideHostnameMixin, S3Boto3Storage): +class NoManifestS3StaticStorage(S3StaticStorageMixin, OverrideHostnameMixin, RTDS3Boto3Storage): """ Storage backend for static files used outside Django's static files. @@ -107,7 +121,7 @@ class NoManifestS3StaticStorage(S3StaticStorageMixin, OverrideHostnameMixin, S3B internal_redirect_root_path = "proxito-static" -class S3BuildToolsStorage(S3PrivateBucketMixin, S3Boto3Storage): +class S3BuildToolsStorage(S3PrivateBucketMixin, RTDS3Boto3Storage): bucket_name = getattr(settings, "S3_BUILD_TOOLS_STORAGE_BUCKET", None) def __init__(self, *args, **kwargs):