Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
208 changes: 2 additions & 206 deletions readthedocs/builds/storage.py
Original file line number Diff line number Diff line change
@@ -1,217 +1,13 @@
from functools import cached_property
from pathlib import Path

import structlog
from django.conf import settings
from django.contrib.staticfiles.storage import StaticFilesStorage as BaseStaticFilesStorage
from django.core.exceptions import SuspiciousFileOperation
from django.core.files.storage import FileSystemStorage
from storages.utils import get_available_overwrite_name

from readthedocs.core.utils.filesystem import safe_open
from readthedocs.storage.rclone import RCloneLocal
from readthedocs.storage.utils import safe_join


log = structlog.get_logger(__name__)


class BuildMediaStorageMixin:
"""
A mixin for Storage classes needed to write build artifacts.

This adds and modifies some functionality to Django's File Storage API.
By default, classes mixing this in will now overwrite files by default instead
of finding an available name.
This mixin also adds convenience methods to copy and delete entire directories.
from readthedocs.storage.filesystem import RTDFileSystemStorage

See: https://docs.djangoproject.com/en/1.11/ref/files/storage
"""

class BuildMediaFileSystemStorage(RTDFileSystemStorage):
# Root path of the nginx internal redirect
# that will serve files from this storage.
internal_redirect_root_path = "proxito"

@staticmethod
def _dirpath(path):
"""
Make the path to end with `/`.

It may just be Azure, but for listdir to work correctly, this is needed.
"""
path = str(path)
if not path.endswith("/"):
path += "/"

return path

def get_available_name(self, name, max_length=None):
"""
Overrides Django's storage to always return the passed name (overwrite).

By default, Django will not overwrite files even if the same name is specified.
This changes that functionality so that the default is to use the same name and overwrite
rather than modify the path to not clobber files.
"""
return get_available_overwrite_name(name, max_length=max_length)

def delete_directory(self, path):
"""
Delete all files under a certain path from storage.

Many storage backends (S3, Azure storage) don't care about "directories".
The directory effectively doesn't exist if there are no files in it.
However, in these backends, there is no "rmdir" operation so you have to recursively
delete all files.

:param path: the path to the directory to remove
"""
if path in ("", "/"):
raise SuspiciousFileOperation("Deleting all storage cannot be right")

log.debug("Deleting path from media storage", path=path)
folders, files = self.listdir(self._dirpath(path))
for folder_name in folders:
if folder_name:
# Recursively delete the subdirectory
self.delete_directory(self.join(path, folder_name))
for filename in files:
if filename:
self.delete(self.join(path, filename))

def copy_directory(self, source, destination):
"""
Copy a directory recursively to storage.

:param source: the source path on the local disk
:param destination: the destination path in storage
"""
log.debug(
"Copying source directory to media storage",
source=source,
destination=destination,
)
source = Path(source)
self._check_suspicious_path(source)
for filepath in source.iterdir():
sub_destination = self.join(destination, filepath.name)

# Don't follow symlinks when uploading to storage.
if filepath.is_symlink():
log.info(
"Skipping symlink upload.",
path_resolved=str(filepath.resolve()),
)
continue

if filepath.is_dir():
# Recursively copy the subdirectory
self.copy_directory(filepath, sub_destination)
elif filepath.is_file():
with safe_open(filepath, "rb") as fd:
self.save(sub_destination, fd)

def _check_suspicious_path(self, path):
"""Check that the given path isn't a symlink or outside the doc root."""
path = Path(path)
resolved_path = path.resolve()
if path.is_symlink():
msg = "Suspicious operation over a symbolic link."
log.error(msg, path=str(path), resolved_path=str(resolved_path))
raise SuspiciousFileOperation(msg)

docroot = Path(settings.DOCROOT).absolute()
if not path.is_relative_to(docroot):
msg = "Suspicious operation outside the docroot directory."
log.error(msg, path=str(path), resolved_path=str(resolved_path))
raise SuspiciousFileOperation(msg)

@cached_property
def _rclone(self):
raise NotImplementedError

def rclone_sync_directory(self, source, destination):
"""Sync a directory recursively to storage using rclone sync."""
if destination in ("", "/"):
raise SuspiciousFileOperation("Syncing all storage cannot be right")

self._check_suspicious_path(source)
return self._rclone.sync(source, destination)

def join(self, directory, filepath):
return safe_join(directory, filepath)

def walk(self, top):
if top in ("", "/"):
raise SuspiciousFileOperation("Iterating all storage cannot be right")

log.debug("Walking path in media storage", path=top)
folders, files = self.listdir(self._dirpath(top))

yield top, folders, files

for folder_name in folders:
if folder_name:
# Recursively walk the subdirectory
yield from self.walk(self.join(top, folder_name))


class BuildMediaFileSystemStorage(BuildMediaStorageMixin, FileSystemStorage):
"""Storage subclass that writes build artifacts in PRODUCTION_MEDIA_ARTIFACTS or MEDIA_ROOT."""

def __init__(self, **kwargs):
location = kwargs.pop("location", None)

if not location:
# Mirrors the logic of getting the production media path
if settings.DEFAULT_PRIVACY_LEVEL == "public" or settings.DEBUG:
location = settings.MEDIA_ROOT
else:
location = settings.PRODUCTION_MEDIA_ARTIFACTS

super().__init__(location)

@cached_property
def _rclone(self):
return RCloneLocal(location=self.location)

def get_available_name(self, name, max_length=None):
"""
A hack to overwrite by default with the FileSystemStorage.

After upgrading to Django 2.2, this method can be removed
because subclasses can set OS_OPEN_FLAGS such that FileSystemStorage._save
will properly overwrite files.
See: https://github.com/django/django/pull/8476
"""
name = super().get_available_name(name, max_length=max_length)
if self.exists(name):
self.delete(name)
return name

def listdir(self, path):
"""
Return empty lists for nonexistent directories.

This mimics what cloud storages do.
"""
if not self.exists(path):
return [], []
return super().listdir(path)

def url(self, name, *args, **kwargs): # noqa
"""
Override to accept extra arguments and ignore them all.

This method helps us to bring compatibility between Azure Blob Storage
(which does not use the HTTP method) and Amazon S3 (who requires HTTP
method to build the signed URL).

``FileSystemStorage`` does not support any other argument than ``name``.
https://docs.djangoproject.com/en/2.2/ref/files/storage/#django.core.files.storage.Storage.url
"""
return super().url(name)


class StaticFilesStorage(BaseStaticFilesStorage):
# Root path of the nginx internal redirect
Expand Down
51 changes: 1 addition & 50 deletions readthedocs/rtd_tests/tests/test_build_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,18 +39,6 @@ def assertFileTree(self, source, tree):
for folder, files in dirs_tree:
self.assertFileTree(self.storage.join(source, folder), files)

def test_copy_directory(self):
self.assertFalse(self.storage.exists("files/test.html"))

with override_settings(DOCROOT=files_dir):
self.storage.copy_directory(files_dir, "files")
self.assertTrue(self.storage.exists("files/test.html"))
self.assertTrue(self.storage.exists("files/conf.py"))
self.assertTrue(self.storage.exists("files/api.fjson"))
self.assertTrue(self.storage.exists("files/api/index.html"))
self.assertFalse(self.storage.exists("files/test-symlink.html"))
self.assertFalse(self.storage.exists("files/dir-symlink"))

def test_sync_directory(self):
tmp_files_dir = os.path.join(tempfile.mkdtemp(), "files")
shutil.copytree(files_dir, tmp_files_dir, symlinks=True)
Expand Down Expand Up @@ -100,15 +88,6 @@ def test_sync_directory_source_symlink(self):
with pytest.raises(SuspiciousFileOperation, match="symbolic link"):
self.storage.rclone_sync_directory(tmp_symlink_dir, "files")

def test_copy_directory_source_symlink(self):
tmp_dir = Path(tempfile.mkdtemp())
tmp_symlink_dir = Path(tempfile.mkdtemp()) / "files"
tmp_symlink_dir.symlink_to(tmp_dir)

with override_settings(DOCROOT=tmp_dir):
with pytest.raises(SuspiciousFileOperation, match="symbolic link"):
self.storage.copy_directory(tmp_symlink_dir, "files")

def test_sync_directory_source_outside_docroot(self):
tmp_dir = Path(tempfile.mkdtemp())
tmp_docroot = Path(tempfile.mkdtemp()) / "docroot"
Expand All @@ -118,18 +97,9 @@ def test_sync_directory_source_outside_docroot(self):
with pytest.raises(SuspiciousFileOperation, match="outside the docroot"):
self.storage.rclone_sync_directory(tmp_dir, "files")

def test_copy_directory_source_outside_docroot(self):
tmp_dir = Path(tempfile.mkdtemp())
tmp_docroot = Path(tempfile.mkdtemp()) / "docroot"
tmp_docroot.mkdir()

with override_settings(DOCROOT=tmp_docroot):
with pytest.raises(SuspiciousFileOperation, match="outside the docroot"):
self.storage.copy_directory(tmp_dir, "files")

def test_delete_directory(self):
with override_settings(DOCROOT=files_dir):
self.storage.copy_directory(files_dir, "files")
self.storage.rclone_sync_directory(files_dir, "files")
dirs, files = self.storage.listdir("files")
self.assertEqual(dirs, ["api"])
self.assertCountEqual(
Expand All @@ -147,25 +117,6 @@ def test_delete_directory(self):
self.assertEqual(dirs, [])
self.assertEqual(files, [])

def test_walk(self):
with override_settings(DOCROOT=files_dir):
self.storage.copy_directory(files_dir, "files")

output = list(self.storage.walk("files"))
self.assertEqual(len(output), 2)

top, dirs, files = output[0]
self.assertEqual(top, "files")
self.assertCountEqual(dirs, ["api"])
self.assertCountEqual(
files, ["404.html", "api.fjson", "conf.py", "index.html", "test.html"]
)

top, dirs, files = output[1]
self.assertEqual(top, "files/api")
self.assertCountEqual(dirs, [])
self.assertCountEqual(files, ["index.html"])

def test_rclone_sync(self):
tmp_files_dir = Path(tempfile.mkdtemp()) / "files"
shutil.copytree(files_dir, tmp_files_dir, symlinks=True)
Expand Down
2 changes: 1 addition & 1 deletion readthedocs/rtd_tests/tests/test_imported_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def tearDown(self):

def _copy_storage_dir(self, version):
"""Copy the test directory (rtd_tests/files) to storage"""
self.storage.copy_directory(
self.storage.rclone_sync_directory(
self.test_dir,
self.project.get_storage_path(
type_="html",
Expand Down
12 changes: 12 additions & 0 deletions readthedocs/settings/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,12 +147,24 @@ def STORAGES(self):
},
"build-media": {
"BACKEND": self.RTD_BUILD_MEDIA_STORAGE,
"OPTIONS": {
"location": self.MEDIA_ROOT,
"allow_overwrite": True,
},
},
"build-commands": {
"BACKEND": self.RTD_BUILD_COMMANDS_STORAGE,
"OPTIONS": {
"location": self.MEDIA_ROOT,
"allow_overwrite": True,
},
},
"build-tools": {
"BACKEND": self.RTD_BUILD_TOOLS_STORAGE,
"OPTIONS": {
"location": self.MEDIA_ROOT,
"allow_overwrite": True,
},
},
"usercontent": {
"BACKEND": "django.core.files.storage.FileSystemStorage",
Expand Down
60 changes: 60 additions & 0 deletions readthedocs/storage/filesystem.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import shutil
from functools import cached_property

from django.core.exceptions import SuspiciousFileOperation
from django.core.files.storage import FileSystemStorage

from readthedocs.storage.mixins import RTDBaseStorage
from readthedocs.storage.rclone import RCloneLocal
from readthedocs.storage.utils import safe_join


class RTDFileSystemStorage(RTDBaseStorage, FileSystemStorage):
    """
    Storage subclass that writes files to the local filesystem.

    .. note:: This storage is used on tests only, and it is not used in production.
    """

    def __init__(self, location=None, allow_overwrite=False, **kwargs):
        # TODO: find a better way to not pass unknown kwargs to FileSystemStorage.
        # This happens because we are creating storage instances dynamically in
        # readthedocs.projects.tasks.storage and we are passing the credentials
        # as kwargs, which are not used by FileSystemStorage.
        # NOTE: this is used on tests only.
        super().__init__(location=location, allow_overwrite=allow_overwrite)

    @cached_property
    def _rclone(self):
        # Local rclone backend rooted at this storage's location.
        # Presumably consumed by the sync methods on RTDBaseStorage — confirm there.
        return RCloneLocal(location=self.location)

    def delete_directory(self, path):
        """
        Recursively delete a directory (relative to the storage root) from disk.

        Missing directories are ignored (``ignore_errors=True``), mimicking
        cloud storages where deleting a nonexistent prefix is a no-op.

        :param path: storage-relative path of the directory to remove.
        :raises SuspiciousFileOperation: if ``path`` would wipe the entire
            storage root (empty string or ``"/"``).
        """
        # Guard against clobbering the whole storage location: without this,
        # ``self.path("")`` resolves to the storage root and rmtree would
        # delete everything under it.
        if path in ("", "/"):
            raise SuspiciousFileOperation("Deleting all storage cannot be right")
        path = self.path(path)
        shutil.rmtree(path, ignore_errors=True)

    def url(self, name, *args, **kwargs):
        """
        Override to accept extra arguments and ignore them all.

        This method helps us to bring compatibility between Azure Blob Storage
        (which does not use the HTTP method) and Amazon S3 (who requires HTTP
        method to build the signed URL).

        ``FileSystemStorage`` does not support any other argument than ``name``.
        https://docs.djangoproject.com/en/5.2/ref/files/storage/#django.core.files.storage.Storage.url
        """
        return super().url(name)

    def listdir(self, path):
        """
        Return empty lists for nonexistent directories.

        This mimics what cloud storages do.
        """
        if not self.exists(path):
            return [], []
        return super().listdir(path)

    def join(self, directory, filepath):
        # NOTE: tests are relying on this being the one used in cloud storage backends,
        # in reality, we should use ``django.utils._os.safe_join``.
        return safe_join(directory, filepath)
Loading
Loading