Skip to content
73 changes: 73 additions & 0 deletions readthedocs/builds/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@

import datetime
import fnmatch
import json
import os.path
import re
from functools import partial
from io import BytesIO

import regex
import structlog
Expand All @@ -29,6 +31,7 @@
from readthedocs.builds.constants import EXTERNAL_VERSION_STATES
from readthedocs.builds.constants import INTERNAL
from readthedocs.builds.constants import LATEST
from readthedocs.builds.constants import MAX_BUILD_COMMAND_SIZE
from readthedocs.builds.constants import PREDEFINED_MATCH_ARGS
from readthedocs.builds.constants import PREDEFINED_MATCH_ARGS_VALUES
from readthedocs.builds.constants import STABLE
Expand Down Expand Up @@ -73,6 +76,7 @@
from readthedocs.projects.ordering import ProjectItemPositionManager
from readthedocs.projects.validators import validate_build_config_file
from readthedocs.projects.version_handling import determine_stable_version
from readthedocs.storage import build_commands_storage


log = structlog.get_logger(__name__)
Expand Down Expand Up @@ -386,8 +390,15 @@ def get_absolute_url(self):
)

def delete(self, *args, **kwargs):
    """
    Delete the version together with its stored artifacts.

    Queues removal of the cold-storage build-command files for this
    version's builds, cleans the project resources (docs artifacts) for
    the version, and finally deletes the database row.
    """
    from readthedocs.builds.tasks import remove_build_commands_storage_paths
    from readthedocs.projects.tasks.utils import clean_project_resources

    # Remove build artifacts from storage for cold storage builds.
    paths_to_delete = [
        build.storage_path
        for build in self.builds.filter(cold_storage=True).iterator()
    ]
    # Don't queue a no-op Celery task when there is nothing to remove.
    if paths_to_delete:
        remove_build_commands_storage_paths.delay(paths_to_delete)

    log.info("Removing files for version.", version_slug=self.slug)
    clean_project_resources(self.project, self)
    super().delete(*args, **kwargs)
Expand Down Expand Up @@ -869,6 +880,68 @@ def save(self, *args, **kwargs): # noqa
self._readthedocs_yaml_config = None
self._readthedocs_yaml_config_changed = False

def delete(self, *args, **kwargs):
    """Delete the build, removing its cold-storage command file first (best effort)."""
    # When the build steps live in the storage backend instead of the
    # database, drop that file before deleting the row. Failures are only
    # logged so a storage hiccup never blocks the database delete.
    if self.cold_storage:
        path = self.storage_path
        try:
            build_commands_storage.delete(path)
        except Exception:
            log.info(
                "Failed to delete build commands from storage.",
                build_id=self.id,
                storage_path=path,
                exc_info=True,
            )
    return super().delete(*args, **kwargs)

def move_to_cold_storage(self):
    """
    Move build steps to cold storage if they are not already there.

    Build steps are removed from the database and stored in a file in the storage backend.
    This is useful for old builds that are not accessed frequently, to save space in the database.
    """
    # Local import to avoid a circular dependency between models and serializers.
    from readthedocs.api.v2.serializers import BuildCommandSerializer

    # Already archived; nothing to do.
    if self.cold_storage:
        return

    # Serialize the command rows to plain dicts before deleting them.
    commands = BuildCommandSerializer(self.commands, many=True).data
    if commands:
        for cmd in commands:
            # Keep only the tail of oversized outputs so the stored JSON stays bounded.
            if len(cmd["output"]) > MAX_BUILD_COMMAND_SIZE:
                cmd["output"] = cmd["output"][-MAX_BUILD_COMMAND_SIZE:]
                cmd["output"] = (
                    "\n\n"
                    "... (truncated) ..."
                    "\n\n"
                    "Command output too long. Truncated to last 1MB."
                    "\n\n" + cmd["output"]
                )
                log.debug("Truncating build command for build.", build_id=self.id)
        output = BytesIO(json.dumps(commands).encode("utf8"))
        try:
            build_commands_storage.save(name=self.storage_path, content=output)
            # Only drop the database rows once the file was saved successfully.
            self.commands.all().delete()
        except IOError:
            # Leave the build untouched so a later run can retry the upload.
            log.exception("Cold Storage save failure")
            return

    self.cold_storage = True
    self.save()

@property
def storage_path(self):
    """
    Storage path where the build commands will be stored.

    The path is in the format: <date>/<build_id>.json

    Example: 2024-01-01/1111.json
    """
    return "{}/{}.json".format(self.date.date(), self.id)

def get_absolute_url(self):
    """Return the URL of this build's detail page (project slug + build pk)."""
    return reverse("builds_detail", args=[self.project.slug, self.pk])

Expand Down
118 changes: 88 additions & 30 deletions readthedocs/builds/tasks.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,28 @@
import json
from io import BytesIO

import requests
import structlog
from django.conf import settings
from django.core.cache import cache
from django.urls import reverse
from django.utils import timezone
from django.utils.translation import gettext_lazy as _
from oauthlib.oauth2.rfc6749.errors import InvalidGrantError
from oauthlib.oauth2.rfc6749.errors import TokenExpiredError

from readthedocs import __version__
from readthedocs.api.v2.serializers import BuildCommandSerializer
from readthedocs.api.v2.utils import delete_versions_from_db
from readthedocs.api.v2.utils import get_deleted_active_versions
from readthedocs.api.v2.utils import run_version_automation_rules
from readthedocs.api.v2.utils import sync_versions_to_db
from readthedocs.builds.constants import BRANCH
from readthedocs.builds.constants import BUILD_FINAL_STATES
from readthedocs.builds.constants import BUILD_STATUS_FAILURE
from readthedocs.builds.constants import BUILD_STATUS_PENDING
from readthedocs.builds.constants import BUILD_STATUS_SUCCESS
from readthedocs.builds.constants import EXTERNAL
from readthedocs.builds.constants import EXTERNAL_VERSION_STATE_CLOSED
from readthedocs.builds.constants import LOCK_EXPIRE
from readthedocs.builds.constants import MAX_BUILD_COMMAND_SIZE
from readthedocs.builds.constants import TAG
from readthedocs.builds.models import Build
from readthedocs.builds.models import BuildConfig
Expand All @@ -32,6 +31,7 @@
from readthedocs.builds.utils import memcache_lock
from readthedocs.core.utils import send_email
from readthedocs.core.utils import trigger_build
from readthedocs.core.utils.db import delete_in_batches
from readthedocs.integrations.models import HttpExchange
from readthedocs.notifications.models import Notification
from readthedocs.oauth.notifications import MESSAGE_OAUTH_BUILD_STATUS_FAILURE
Expand All @@ -45,7 +45,7 @@


@app.task(queue="web", bind=True)
def archive_builds_task(self, days=14, limit=200, delete=False):
def archive_builds_task(self, days=14, limit=200):
"""
Task to archive old builds to cold storage.

Expand All @@ -71,33 +71,8 @@ def archive_builds_task(self, days=14, limit=200, delete=False):
.prefetch_related("commands")
.only("date", "cold_storage")[:limit]
)

for build in queryset:
commands = BuildCommandSerializer(build.commands, many=True).data
if commands:
for cmd in commands:
if len(cmd["output"]) > MAX_BUILD_COMMAND_SIZE:
cmd["output"] = cmd["output"][-MAX_BUILD_COMMAND_SIZE:]
cmd["output"] = (
"\n\n"
"... (truncated) ..."
"\n\n"
"Command output too long. Truncated to last 1MB."
"\n\n" + cmd["output"]
) # noqa
log.debug("Truncating build command for build.", build_id=build.id)
output = BytesIO(json.dumps(commands).encode("utf8"))
filename = "{date}/{id}.json".format(date=str(build.date.date()), id=build.id)
try:
build_commands_storage.save(name=filename, content=output)
if delete:
build.commands.all().delete()
except IOError:
log.exception("Cold Storage save failure")
continue

build.cold_storage = True
build.save()
build.move_to_cold_storage()


@app.task(queue="web")
Expand Down Expand Up @@ -606,3 +581,86 @@ def remove_orphan_build_config():
count = orphan_buildconfigs.count()
orphan_buildconfigs.delete()
log.info("Removed orphan BuildConfig objects.", count=count)


@app.task(queue="web")
def delete_old_build_objects(
    days=360 * 3, keep_recent=250, limit=20_000, max_projects=5_000, start=None
):
    """
    Delete old Build objects that are not needed anymore.

    We keep the most recent `keep_recent` builds per version,
    and delete builds that are older than `days` days.

    :param limit: Maximum number of builds to delete in one execution.
     Keep our DB from being overwhelmed by deleting too many builds at once.
    :param max_projects: Maximum number of projects to process in one execution.
     Avoids iterating over all projects in one execution.
    :param start: Starting index for processing projects.
     Normally, this value comes from the cache, and is updated after each execution
     to continue from where it left off.
     But it can also be set to a specific value for manual execution.
    """
    cache_key = "rtd-task:delete_old_build_objects_start"
    max_count = Project.objects.count()
    # Only consult/advance the cached cursor when the caller didn't pin `start`.
    use_cache = start is None
    if use_cache:
        start = cache.get(cache_key, 0)
        if start >= max_count:
            # Wrap around once every project has been visited.
            start = 0
    end = start + max_projects
    if use_cache:
        cache.set(cache_key, end)

    cutoff_date = timezone.now() - timezone.timedelta(days=days)
    projects = Project.objects.all().order_by("pub_date").only("pk")[start:end]
    for project in projects:
        # Delete builds associated with versions, keeping
        # the most recent `keep_recent` builds per version.
        for version in project.versions.exclude(builds=None).only("pk").iterator():
            builds_to_delete = version.builds.filter(
                state__in=BUILD_FINAL_STATES,
                date__lt=cutoff_date,
            ).order_by("-date")
            # NOTE: the slicing happens inside _delete_builds, since batched
            # deletion can't run over an already-sliced queryset.
            limit -= _delete_builds(builds_to_delete, start=keep_recent, end=keep_recent + limit)
            if limit <= 0:
                return

        # Delete builds that are not associated with any version,
        # keeping the most recent `keep_recent` builds per project.
        builds_to_delete = project.builds.filter(
            version=None, state__in=BUILD_FINAL_STATES, date__lt=cutoff_date
        ).order_by("-date")
        limit -= _delete_builds(builds_to_delete, start=keep_recent, end=keep_recent + limit)
        if limit <= 0:
            return
Comment on lines +629 to +636
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the builds are not associated to any version, this is because the version was deleted. We shouldn't keep only keep_recent for all of those builds here, but keep_recent builds per build.version_slug instead to be consistent with the formula.

Copy link
Member Author

@stsewd stsewd Mar 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it worth complicating this more? If the version is deleted, the build is never referenced by any version. We could keep 2x or 3x the builds here if we want to keep more builds, but honestly I think we are fine here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What are we saving these builds for if they are not referenced by any version?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So their builds page wouldn't immediately 404, and users would have a history of previous builds. I'd be fine with deleting builds when a version is deleted.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think we either want:

  1. delete these build objects, OR
  2. apply the same formula we are applying to the other builds

but I wouldn't create another logic for this particular use case.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but I wouldn't create another logic for this particular use case.

It's simpler and faster to apply a global limit than grouping the query for versions and then delete builds for each one of them...



def _delete_builds(builds, start: int, end: int) -> int:
    """
    Delete builds from the queryset, starting from `start` index and ending at `end` index.

    This also deletes the storage paths associated with the builds if they are in cold storage.

    :param builds: Unsliced queryset of builds, already ordered by the caller.
    :param start: Index of the first build to delete.
    :param end: Index one past the last build to delete.
    :returns: The number of builds deleted.
    """
    # NOTE: we can't use a filter over an sliced queryset, so collect the
    # cold-storage paths up front, before the rows are gone.
    paths_to_delete = [
        build.storage_path for build in builds[start:end] if build.cold_storage
    ]

    _, deleted = delete_in_batches(builds, start=start, end=end)
    # Synchronous call on purpose (we are already inside a task); skip it
    # entirely when there are no cold-storage files to remove.
    if paths_to_delete:
        remove_build_commands_storage_paths(paths_to_delete)
    return deleted.get("builds.Build", 0)


@app.task(queue="web")
def remove_build_commands_storage_paths(paths):
    """Remove the build commands from storage for the given paths."""
    log.info("Removing paths from build commands storage.", paths=paths)
    for storage_path in paths:
        try:
            build_commands_storage.delete(storage_path)
        except Exception:
            # Best effort: a failure on one path shouldn't abort the rest.
            log.info("Failed to delete build commands from storage.", path=storage_path, exc_info=True)
Loading