Source code for cached_path.bytes_range

from typing import TYPE_CHECKING, Optional
from urllib.parse import urlparse

from ._cached_path import cached_path, get_from_cache
from .common import PathOrStr
from .schemes import get_scheme_client, get_supported_schemes

if TYPE_CHECKING:
    from rich.progress import Progress


[docs]def get_bytes_range(
    url_or_filename: PathOrStr,
    index: int,
    length: int,
    cache_dir: Optional[PathOrStr] = None,
    extract_archive: bool = False,
    force_extract: bool = False,
    quiet: bool = False,
    progress: Optional["Progress"] = None,
) -> bytes:
    """
    Get a range of up to ``length`` bytes starting at ``index``.

    In some cases the entire file may need to be downloaded, such as when the server does not support
    a range download or when you're trying to get a bytes range from a file within an archive.

    .. caution::
        You may get less than ``length`` bytes sometimes, such as when fetching a range from an HTTP
        resource starting at 0 since headers will be omitted in the bytes returned.

    Parameters
    ----------

    url_or_filename :
        A URL or path to parse and possibly download.

    index :
        The index of the byte to start at.

    length :
        The number of bytes to read.

    cache_dir :
        The directory to cache downloads. If not specified, the global default cache directory
        will be used (``~/.cache/cached_path``). This can be set to something else with
        :func:`set_cache_dir()`.

        This is only relevant when the bytes range cannot be obtained directly from the resource.

    extract_archive :
        Set this to ``True`` when you want to get a bytes range from a file within an archive.
        In this case the ``url_or_filename`` must contain an "!" followed by the relative path of the file
        within the archive, e.g. "s3://my-archive.tar.gz!my-file.txt".

        Note that the entire archive has to be downloaded in this case.

    force_extract :
        If ``True`` and the resource is a file within an archive (when the path contains an "!" and
        ``extract_archive=True``), it will be extracted regardless of whether or not the extracted
        directory already exists.

        .. caution::
            Use this flag with caution! This can lead to race conditions if used
            from multiple processes on the same file.

    quiet :
        If ``True``, progress displays won't be printed.

        This is only relevant when the bytes range cannot be obtained directly from the resource.

    progress :
        A custom progress display to use. If not set and ``quiet=False``, a default display
        from :func:`~cached_path.get_download_progress()` will be used.

        This is only relevant when the bytes range cannot be obtained directly from the resource.
    """
    if not isinstance(url_or_filename, str):
        url_or_filename = str(url_or_filename)

    # If we're using the /a/b/foo.zip!c/d/file.txt syntax, handle it here.
    exclamation_index = url_or_filename.find("!")
    if extract_archive and exclamation_index >= 0:
        archive_path = url_or_filename[:exclamation_index]
        file_name = url_or_filename[exclamation_index + 1 :]

        # Call 'cached_path' now to get the local path to the archive itself.
        cached_archive_path = cached_path(
            archive_path,
            cache_dir=cache_dir,
            extract_archive=True,
            force_extract=force_extract,
            quiet=quiet,
            progress=progress,
        )
        if not cached_archive_path.is_dir():
            raise ValueError(
                f"{url_or_filename} uses the ! syntax, but does not specify an archive file."
            )

        # Now load bytes from the desired file within the extracted archive, provided it exists.
        file_path = cached_archive_path / file_name
        if not file_path.exists():
            raise FileNotFoundError(f"'{file_name}' not found within '{archive_path}'")

        return _bytes_range_from_file(file_path, index, length)

    if urlparse(url_or_filename).scheme in get_supported_schemes():
        # URL, so use the scheme client.
        client = get_scheme_client(url_or_filename)

        # Check if file is already downloaded.
        try:
            cache_path, _ = get_from_cache(
                url_or_filename,
                cache_dir=cache_dir,
                quiet=quiet,
                progress=progress,
                no_downloads=True,
                _client=client,
            )
            return _bytes_range_from_file(cache_path, index, length)
        except FileNotFoundError:
            pass

        # Otherwise try streaming bytes directly.
        try:
            return client.get_bytes_range(index, length)
        except NotImplementedError:
            # fall back to downloading the whole file.
            pass

    file_path = cached_path(url_or_filename, cache_dir=cache_dir, quiet=quiet, progress=progress)
    return _bytes_range_from_file(file_path, index, length)


def _bytes_range_from_file(path: PathOrStr, index: int, length: int) -> bytes:
    with open(path, "rb") as f:
        f.seek(index)
        return f.read(length)