pytorch · NicolasHug · Feb 28, 2025 · Feb 25, 2025 · Feb 27, 2025 · Feb 27, 2025
diff --git a/test/test_datasets.py b/test/test_datasets.py
@@ -24,6 +24,7 @@
 import torch.nn.functional as F
 from common_utils import combinations_grid
 from torchvision import datasets
+from torchvision.io import decode_image
 from torchvision.transforms import v2
 
 
@@ -1175,6 +1176,8 @@ class SBUTestCase(datasets_utils.ImageDatasetTestCase):
  DATASET_CLASS = datasets.SBU
  FEATURE_TYPES = (PIL.Image.Image, str)
 
+ SUPPORT_TV_IMAGE_DECODE = True
+
  def inject_fake_data(self, tmpdir, config):
  num_images = 3
 
@@ -1413,6 +1416,8 @@ class Flickr8kTestCase(datasets_utils.ImageDatasetTestCase):
  _IMAGES_FOLDER = "images"
  _ANNOTATIONS_FILE = "captions.html"
 
+ SUPPORT_TV_IMAGE_DECODE = True
+
  def dataset_args(self, tmpdir, config):
  tmpdir = pathlib.Path(tmpdir)
  root = tmpdir / self._IMAGES_FOLDER
@@ -1482,6 +1487,8 @@ class Flickr30kTestCase(Flickr8kTestCase):
 
  _ANNOTATIONS_FILE = "captions.token"
 
+ SUPPORT_TV_IMAGE_DECODE = True
+
  def _image_file_name(self, idx):
  return f"{idx}.jpg"
 
@@ -1942,6 +1949,8 @@ class LFWPeopleTestCase(datasets_utils.DatasetTestCase):
  _IMAGES_DIR = {"original": "lfw", "funneled": "lfw_funneled", "deepfunneled": "lfw-deepfunneled"}
  _file_id = {"10fold": "", "train": "DevTrain", "test": "DevTest"}
 
+ SUPPORT_TV_IMAGE_DECODE = True
+
  def inject_fake_data(self, tmpdir, config):
  tmpdir = pathlib.Path(tmpdir) / "lfw-py"
  os.makedirs(tmpdir, exist_ok=True)
@@ -1978,6 +1987,18 @@ def _create_random_id(self):
  part2 = datasets_utils.create_random_string(random.randint(4, 7))
  return f"{part1}_{part2}"
 
+ def test_tv_decode_image_support(self):
+ if not self.SUPPORT_TV_IMAGE_DECODE:
+ pytest.skip(f"{self.DATASET_CLASS.__name__} does not support torchvision.io.decode_image.")
+
+ with self.create_dataset(
+ config=dict(
+ loader=decode_image,
+ )
+ ) as (dataset, _):
+ image = dataset[0][0]
+ assert isinstance(image, torch.Tensor)
+
 
 class LFWPairsTestCase(LFWPeopleTestCase):
  DATASET_CLASS = datasets.LFWPairs
@@ -2335,6 +2356,8 @@ class Food101TestCase(datasets_utils.ImageDatasetTestCase):
 
  ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test"))
 
+ SUPPORT_TV_IMAGE_DECODE = True
+
  def inject_fake_data(self, tmpdir: str, config):
  root_folder = pathlib.Path(tmpdir) / "food-101"
  image_folder = root_folder / "images"
@@ -2371,6 +2394,7 @@ class FGVCAircraftTestCase(datasets_utils.ImageDatasetTestCase):
  ADDITIONAL_CONFIGS = combinations_grid(
  split=("train", "val", "trainval", "test"), annotation_level=("variant", "family", "manufacturer")
  )
+ SUPPORT_TV_IMAGE_DECODE = True
 
  def inject_fake_data(self, tmpdir: str, config):
  split = config["split"]
@@ -2420,6 +2444,8 @@ def inject_fake_data(self, tmpdir: str, config):
 class SUN397TestCase(datasets_utils.ImageDatasetTestCase):
  DATASET_CLASS = datasets.SUN397
 
+ SUPPORT_TV_IMAGE_DECODE = True
+
  def inject_fake_data(self, tmpdir: str, config):
  data_dir = pathlib.Path(tmpdir) / "SUN397"
  data_dir.mkdir()
@@ -2451,6 +2477,8 @@ class DTDTestCase(datasets_utils.ImageDatasetTestCase):
  DATASET_CLASS = datasets.DTD
  FEATURE_TYPES = (PIL.Image.Image, int)
 
+ SUPPORT_TV_IMAGE_DECODE = True
+
  ADDITIONAL_CONFIGS = combinations_grid(
  split=("train", "test", "val"),
  # There is no need to test the whole matrix here, since each fold is treated exactly the same
@@ -2611,6 +2639,7 @@ class CLEVRClassificationTestCase(datasets_utils.ImageDatasetTestCase):
  FEATURE_TYPES = (PIL.Image.Image, (int, type(None)))
 
  ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test"))
+ SUPPORT_TV_IMAGE_DECODE = True
 
  def inject_fake_data(self, tmpdir, config):
  data_folder = pathlib.Path(tmpdir) / "clevr" / "CLEVR_v1.0"
@@ -2708,6 +2737,8 @@ class StanfordCarsTestCase(datasets_utils.ImageDatasetTestCase):
  REQUIRED_PACKAGES = ("scipy",)
  ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test"))
 
+ SUPPORT_TV_IMAGE_DECODE = True
+
  def inject_fake_data(self, tmpdir, config):
  import scipy.io as io
  from numpy.core.records import fromarrays
@@ -2782,6 +2813,8 @@ class Flowers102TestCase(datasets_utils.ImageDatasetTestCase):
  ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test"))
  REQUIRED_PACKAGES = ("scipy",)
 
+ SUPPORT_TV_IMAGE_DECODE = True
+
  def inject_fake_data(self, tmpdir: str, config):
  base_folder = pathlib.Path(tmpdir) / "flowers-102"
 
@@ -2840,6 +2873,8 @@ class RenderedSST2TestCase(datasets_utils.ImageDatasetTestCase):
  ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test"))
  SPLIT_TO_FOLDER = {"train": "train", "val": "valid", "test": "test"}
 
+ SUPPORT_TV_IMAGE_DECODE = True
+
  def inject_fake_data(self, tmpdir: str, config):
  root_folder = pathlib.Path(tmpdir) / "rendered-sst2"
  image_folder = root_folder / self.SPLIT_TO_FOLDER[config["split"]]
@@ -3500,6 +3535,8 @@ class ImagenetteTestCase(datasets_utils.ImageDatasetTestCase):
  DATASET_CLASS = datasets.Imagenette
  ADDITIONAL_CONFIGS = combinations_grid(split=["train", "val"], size=["full", "320px", "160px"])
 
+ SUPPORT_TV_IMAGE_DECODE = True
+
  _WNIDS = [
  "n01440764",
  "n02102040",

diff --git a/torchvision/datasets/clevr.py b/torchvision/datasets/clevr.py
@@ -3,7 +3,7 @@
 from typing import Any, Callable, List, Optional, Tuple, Union
 from urllib.parse import urlparse
 
-from PIL import Image
+from .folder import default_loader
 
 from .utils import download_and_extract_archive, verify_str_arg
 from .vision import VisionDataset
@@ -18,11 +18,14 @@ class CLEVRClassification(VisionDataset):
  root (str or ``pathlib.Path``): Root directory of dataset where directory ``root/clevr`` exists or will be saved to if download is
  set to True.
  split (string, optional): The dataset split, supports ``"train"`` (default), ``"val"``, or ``"test"``.
- transform (callable, optional): A function/transform that takes in a PIL image and returns a transformed
- version. E.g, ``transforms.RandomCrop``
+ transform (callable, optional): A function/transform that takes in a PIL image or torch.Tensor, depends on the given loader,
+ and returns a transformed version. E.g, ``transforms.RandomCrop``
  target_transform (callable, optional): A function/transform that takes in them target and transforms it.
  download (bool, optional): If true, downloads the dataset from the internet and puts it in root directory. If
  dataset is already downloaded, it is not downloaded again.
+ loader (callable, optional): A function to load an image given its path.
+ By default, it uses PIL as its image loader, but users could also pass in
+ ``torchvision.io.decode_image`` for decoding image data into tensors directly.
  """
 
  _URL = "https://dl.fbaipublicfiles.com/clevr/CLEVR_v1.0.zip"
@@ -35,9 +38,11 @@ def __init__(
  transform: Optional[Callable] = None,
  target_transform: Optional[Callable] = None,
  download: bool = False,
+ loader: Callable[[Union[str, pathlib.Path]], Any] = default_loader,
  ) -> None:
  self._split = verify_str_arg(split, "split", ("train", "val", "test"))
  super().__init__(root, transform=transform, target_transform=target_transform)
+ self.loader = loader
  self._base_folder = pathlib.Path(self.root) / "clevr"
  self._data_folder = self._base_folder / pathlib.Path(urlparse(self._URL).path).stem
 
@@ -65,7 +70,7 @@ def __getitem__(self, idx: int) -> Tuple[Any, Any]:
  image_file = self._image_files[idx]
  label = self._labels[idx]
 
- image = Image.open(image_file).convert("RGB")
+ image = self.loader(image_file)
 
  if self.transform:
  image = self.transform(image)

diff --git a/torchvision/datasets/country211.py b/torchvision/datasets/country211.py
@@ -16,8 +16,8 @@ class Country211(ImageFolder):
  Args:
  root (str or ``pathlib.Path``): Root directory of the dataset.
  split (string, optional): The dataset split, supports ``"train"`` (default), ``"valid"`` and ``"test"``.
- transform (callable, optional): A function/transform that takes in a PIL image and returns a transformed
- version. E.g, ``transforms.RandomCrop``.
+ transform (callable, optional): A function/transform that takes in a PIL image or torch.Tensor, depends on the given loader,
+ and returns a transformed version. E.g, ``transforms.RandomCrop``
  target_transform (callable, optional): A function/transform that takes in the target and transforms it.
  download (bool, optional): If True, downloads the dataset from the internet and puts it into
  ``root/country211/``. If dataset is already downloaded, it is not downloaded again.

diff --git a/torchvision/datasets/dtd.py b/torchvision/datasets/dtd.py
@@ -2,7 +2,7 @@
 import pathlib
 from typing import Any, Callable, Optional, Tuple, Union
 
-import PIL.Image
+from .folder import default_loader
 
 from .utils import download_and_extract_archive, verify_str_arg
 from .vision import VisionDataset
@@ -21,12 +21,15 @@ class DTD(VisionDataset):
  The partition only changes which split each image belongs to. Thus, regardless of the selected
  partition, combining all splits will result in all images.
 
- transform (callable, optional): A function/transform that takes in a PIL image and returns a transformed
- version. E.g, ``transforms.RandomCrop``.
+ transform (callable, optional): A function/transform that takes in a PIL image or torch.Tensor, depends on the given loader,
+ and returns a transformed version. E.g, ``transforms.RandomCrop``
  target_transform (callable, optional): A function/transform that takes in the target and transforms it.
  download (bool, optional): If True, downloads the dataset from the internet and
  puts it in root directory. If dataset is already downloaded, it is not
  downloaded again. Default is False.
+ loader (callable, optional): A function to load an image given its path.
+ By default, it uses PIL as its image loader, but users could also pass in
+ ``torchvision.io.decode_image`` for decoding image data into tensors directly.
  """
 
  _URL = "https://www.robots.ox.ac.uk/~vgg/data/dtd/download/dtd-r1.0.1.tar.gz"
@@ -40,6 +43,7 @@ def __init__(
  transform: Optional[Callable] = None,
  target_transform: Optional[Callable] = None,
  download: bool = False,
+ loader: Callable[[Union[str, pathlib.Path]], Any] = default_loader,
  ) -> None:
  self._split = verify_str_arg(split, "split", ("train", "val", "test"))
  if not isinstance(partition, int) and not (1 <= partition <= 10):
@@ -72,13 +76,14 @@ def __init__(
  self.classes = sorted(set(classes))
  self.class_to_idx = dict(zip(self.classes, range(len(self.classes))))
  self._labels = [self.class_to_idx[cls] for cls in classes]
+ self.loader = loader
 
  def __len__(self) -> int:
  return len(self._image_files)
 
  def __getitem__(self, idx: int) -> Tuple[Any, Any]:
  image_file, label = self._image_files[idx], self._labels[idx]
- image = PIL.Image.open(image_file).convert("RGB")
+ image = self.loader(image_file)
 
  if self.transform:
  image = self.transform(image)

diff --git a/torchvision/datasets/eurosat.py b/torchvision/datasets/eurosat.py
@@ -14,7 +14,7 @@ class EuroSAT(ImageFolder):
 
  Args:
  root (str or ``pathlib.Path``): Root directory of dataset where ``root/eurosat`` exists.
- transform (callable, optional): A function/transform that takes in a PIL image
+ transform (callable, optional): A function/transform that takes in a PIL image or torch.Tensor, depends on the given loader,
  and returns a transformed version. E.g, ``transforms.RandomCrop``
  target_transform (callable, optional): A function/transform that takes in the
  target and transforms it.

diff --git a/torchvision/datasets/fgvc_aircraft.py b/torchvision/datasets/fgvc_aircraft.py
@@ -4,7 +4,7 @@
 from pathlib import Path
 from typing import Any, Callable, Optional, Tuple, Union
 
-import PIL.Image
+from .folder import default_loader
 
 from .utils import download_and_extract_archive, verify_str_arg
 from .vision import VisionDataset
@@ -29,13 +29,16 @@ class FGVCAircraft(VisionDataset):
  ``trainval`` and ``test``.
  annotation_level (str, optional): The annotation level, supports ``variant``,
  ``family`` and ``manufacturer``.
- transform (callable, optional): A function/transform that takes in a PIL image
+ transform (callable, optional): A function/transform that takes in a PIL image or torch.Tensor, depends on the given loader,
  and returns a transformed version. E.g, ``transforms.RandomCrop``
  target_transform (callable, optional): A function/transform that takes in the
  target and transforms it.
  download (bool, optional): If True, downloads the dataset from the internet and
  puts it in root directory. If dataset is already downloaded, it is not
  downloaded again.
+ loader (callable, optional): A function to load an image given its path.
+ By default, it uses PIL as its image loader, but users could also pass in
+ ``torchvision.io.decode_image`` for decoding image data into tensors directly.
  """
 
  _URL = "https://www.robots.ox.ac.uk/~vgg/data/fgvc-aircraft/archives/fgvc-aircraft-2013b.tar.gz"
@@ -48,6 +51,7 @@ def __init__(
  transform: Optional[Callable] = None,
  target_transform: Optional[Callable] = None,
  download: bool = False,
+ loader: Callable[[str], Any] = default_loader,
  ) -> None:
  super().__init__(root, transform=transform, target_transform=target_transform)
  self._split = verify_str_arg(split, "split", ("train", "val", "trainval", "test"))
@@ -87,13 +91,14 @@ def __init__(
  image_name, label_name = line.strip().split(" ", 1)
  self._image_files.append(os.path.join(image_data_folder, f"{image_name}.jpg"))
  self._labels.append(self.class_to_idx[label_name])
+ self.loader = loader
 
  def __len__(self) -> int:
  return len(self._image_files)
 
  def __getitem__(self, idx: int) -> Tuple[Any, Any]:
  image_file, label = self._image_files[idx], self._labels[idx]
- image = PIL.Image.open(image_file).convert("RGB")
+ image = self.loader(image_file)
 
  if self.transform:
  image = self.transform(image)