pytorch
diff --git a/.github/workflows/build-wheels-windows.yml
Lines changed: 1 addition & 0 deletions b/.github/workflows/build-wheels-windows.yml
Lines changed: 1 addition & 0 deletions
diff --git a/docs/source/io.rst
Lines changed: 61 additions & 46 deletions b/docs/source/io.rst
Lines changed: 61 additions & 46 deletions
diff --git a/packaging/windows/internal/vc_env_helper.bat
Lines changed: 2 additions & 0 deletions b/packaging/windows/internal/vc_env_helper.bat
Lines changed: 2 additions & 0 deletions
diff --git a/torchvision/io/image.py
Lines changed: 38 additions & 42 deletions b/torchvision/io/image.py
Lines changed: 38 additions & 42 deletions
diff --git a/torchvision/io/video.py
Lines changed: 24 additions & 0 deletions b/torchvision/io/video.py
Lines changed: 24 additions & 0 deletions
@@ -25,6 +25,7 @@ jobs:
  os: windows
  test-infra-repository: pytorch/test-infra
  test-infra-ref: main
+ with-xpu: enable
  build:
  needs: generate-matrix
  strategy:
 
@@ -3,33 +3,46 @@ Decoding / Encoding images and videos
 
 .. currentmodule:: torchvision.io
 
-The :mod:`torchvision.io` package provides functions for performing IO
-operations. They are currently specific to reading and writing images and
-videos.
+The :mod:`torchvision.io` module provides utilities for decoding and encoding
+images and videos.
 
-Images
-------
+Image Decoding
+--------------
 
 Torchvision currently supports decoding JPEG, PNG, WEBP and GIF images. JPEG
 decoding can also be done on CUDA GPUs.
 
-For encoding, JPEG (cpu and CUDA) and PNG are supported.
+The main entry point is the :func:`~torchvision.io.decode_image` function, which
+you can use as an alternative to ``PIL.Image.open()``. It will decode images
+straight into image Tensors, thus saving you the conversion and allowing you to
+run transforms/preprocessing natively on tensors.
+
+.. code::
+
+ from torchvision.io import decode_image
+
+ img = decode_image("path_to_image", mode="RGB")
+ img.dtype # torch.uint8
+
+ # Or
+ raw_encoded_bytes = ... # read encoded bytes from your file system
+ img = decode_image(raw_encoded_bytes, mode="RGB")
+
+
+:func:`~torchvision.io.decode_image` will automatically detect the image format,
+and call the corresponding decoder. You can also use the lower-level
+format-specific decoders which can be more powerful, e.g. if you want to
+encode/decode JPEGs on CUDA.
 
 .. autosummary::
  :toctree: generated/
  :template: function.rst
 
  decode_image
- encode_jpeg
  decode_jpeg
- write_jpeg
+ decode_png
  decode_gif
  decode_webp
- encode_png
- decode_png
- write_png
- read_file
- write_file
 
 .. autosummary::
  :toctree: generated/
@@ -41,14 +54,47 @@ Obsolete decoding function:
 
 .. autosummary::
  :toctree: generated/
- :template: class.rst
+ :template: function.rst
 
  read_image
 
+Image Encoding
+--------------
+
+For encoding, JPEG (CPU and CUDA) and PNG are supported.
+
+
+.. autosummary::
+ :toctree: generated/
+ :template: function.rst
+
+ encode_jpeg
+ write_jpeg
+ encode_png
+ write_png
+
+IO operations
+-------------
+
+.. autosummary::
+ :toctree: generated/
+ :template: function.rst
+
+ read_file
+ write_file
 
 Video
 -----
 
+.. warning::
+
+ Torchvision supports video decoding through different APIs listed below,
+ some of which are still in BETA stage. In the near future, we intend to
+ centralize PyTorch's video decoding capabilities within the `torchcodec
+ <https://github.com/pytorch/torchcodec>`_ project. We encourage you to try
+ it out and share your feedback, as the torchvision video decoders will
+ eventually be deprecated.
+
 .. autosummary::
  :toctree: generated/
  :template: function.rst
@@ -58,45 +104,14 @@ Video
  write_video
 
 
-Fine-grained video API
-^^^^^^^^^^^^^^^^^^^^^^
+**Fine-grained video API**
 
 In addition to the :func:`read_video` function, we provide a high-performance
 lower-level API for more fine-grained control compared to the :func:`read_video` function.
 It does all this whilst fully supporting TorchScript.
 
-.. betastatus:: fine-grained video API
-
 .. autosummary::
  :toctree: generated/
  :template: class.rst
 
  VideoReader
-
-
-Example of inspecting a video:
-
-.. code:: python
-
- import torchvision
- video_path = "path to a test video"
- # Constructor allocates memory and a threaded decoder
- # instance per video. At the moment it takes two arguments:
- # path to the video file, and a wanted stream.
- reader = torchvision.io.VideoReader(video_path, "video")
-
- # The information about the video can be retrieved using the 
- # `get_metadata()` method. It returns a dictionary for every stream, with
- # duration and other relevant metadata (often frame rate)
- reader_md = reader.get_metadata()
-
- # metadata is structured as a dict of dicts with following structure
- # {"stream_type": {"attribute": [attribute per stream]}}
- #
- # following would print out the list of frame rates for every present video stream
- print(reader_md["video"]["fps"])
-
- # we explicitly select the stream we would like to operate on. In
- # the constructor we select a default video stream, but
- # in practice, we can set whichever stream we would like 
- video.set_current_stream("video:0")
 
@@ -28,6 +28,8 @@ if "%VSDEVCMD_ARGS%" == "" (
 
 @echo on
 
+if "%CU_VERSION%" == "xpu" call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
+
 set DISTUTILS_USE_SDK=1
 
 set args=%1
 
@@ -20,19 +20,25 @@
 
 
 class ImageReadMode(Enum):
- """
- Support for various modes while reading images.
+ """Allow automatic conversion to RGB, RGBA, etc while decoding.
+
+ .. note::
+
+ You don't need to use this enum; you can just pass strings to all
+ ``mode`` parameters, e.g. ``mode="RGB"``.
 
- Use ``ImageReadMode.UNCHANGED`` for loading the image as-is,
- ``ImageReadMode.GRAY`` for converting to grayscale,
- ``ImageReadMode.GRAY_ALPHA`` for grayscale with transparency,
- ``ImageReadMode.RGB`` for RGB and ``ImageReadMode.RGB_ALPHA`` for
- RGB with transparency.
+ The different available modes are the following.
+
+ - UNCHANGED: loads the image as-is
+ - RGB: converts to RGB
+ - RGBA: converts to RGB with transparency (also aliased as RGB_ALPHA)
+ - GRAY: converts to grayscale
+ - GRAY_ALPHA: converts to grayscale with transparency
 
  .. note::
 
- Some decoders won't support all possible values, e.g. a decoder may only
- support "RGB" and "RGBA" mode.
+ Some decoders won't support all possible values, e.g. GRAY and
+ GRAY_ALPHA are only supported for PNG and JPEG images.
  """
 
  UNCHANGED = 0
@@ -45,8 +51,7 @@ class ImageReadMode(Enum):
 
 def read_file(path: str) -> torch.Tensor:
  """
- Reads and outputs the bytes contents of a file as a uint8 Tensor
- with one dimension.
+ Return the bytes contents of a file as a uint8 1D Tensor.
 
  Args:
  path (str or ``pathlib.Path``): the path to the file to be read
@@ -62,8 +67,7 @@ def read_file(path: str) -> torch.Tensor:
 
 def write_file(filename: str, data: torch.Tensor) -> None:
  """
- Writes the contents of an uint8 tensor with one dimension to a
- file.
+ Write the content of a uint8 1D tensor to a file.
 
  Args:
  filename (str or ``pathlib.Path``): the path to the file to be written
@@ -93,10 +97,9 @@ def decode_png(
  Args:
  input (Tensor[1]): a one dimensional uint8 tensor containing
  the raw bytes of the PNG image.
- mode (str or ImageReadMode): the read mode used for optionally
- converting the image. Default: ``ImageReadMode.UNCHANGED``.
- See `ImageReadMode` class for more information on various
- available modes.
+ mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+ Default is "UNCHANGED". See :class:`~torchvision.io.ImageReadMode`
+ for available modes.
  apply_exif_orientation (bool): apply EXIF orientation transformation to the output tensor.
  Default: False.
 
@@ -156,8 +159,7 @@ def decode_jpeg(
  device: Union[str, torch.device] = "cpu",
  apply_exif_orientation: bool = False,
 ) -> Union[torch.Tensor, List[torch.Tensor]]:
- """
- Decode JPEG image(s) into 3 dimensional RGB or grayscale Tensor(s).
+ """Decode JPEG image(s) into 3D RGB or grayscale Tensor(s), on CPU or CUDA.
 
  The values of the output tensor are uint8 between 0 and 255.
 
@@ -171,12 +173,9 @@ def decode_jpeg(
  input (Tensor[1] or list[Tensor[1]]): a (list of) one dimensional uint8 tensor(s) containing
  the raw bytes of the JPEG image. The tensor(s) must be on CPU,
  regardless of the ``device`` parameter.
- mode (str or ImageReadMode): the read mode used for optionally
- converting the image(s). The supported modes are: ``ImageReadMode.UNCHANGED``,
- ``ImageReadMode.GRAY`` and ``ImageReadMode.RGB``
- Default: ``ImageReadMode.UNCHANGED``.
- See ``ImageReadMode`` class for more information on various
- available modes.
+ mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+ Default is "UNCHANGED". See :class:`~torchvision.io.ImageReadMode`
+ for available modes.
  device (str or torch.device): The device on which the decoded image will
  be stored. If a cuda device is specified, the image will be decoded
  with `nvjpeg <https://developer.nvidia.com/nvjpeg>`_. This is only
@@ -228,9 +227,7 @@ def decode_jpeg(
 def encode_jpeg(
  input: Union[torch.Tensor, List[torch.Tensor]], quality: int = 75
 ) -> Union[torch.Tensor, List[torch.Tensor]]:
- """
- Takes a (list of) input tensor(s) in CHW layout and returns a (list of) buffer(s) with the contents
- of the corresponding JPEG file(s).
+ """Encode RGB tensor(s) into raw encoded JPEG bytes, on CPU or CUDA.
 
  .. note::
  Passing a list of CUDA tensors is more efficient than repeated individual calls to ``encode_jpeg``.
@@ -286,7 +283,7 @@ def decode_image(
  mode: ImageReadMode = ImageReadMode.UNCHANGED,
  apply_exif_orientation: bool = False,
 ) -> torch.Tensor:
- """Decode an image into a tensor.
+ """Decode an image into a uint8 tensor, from a path or from raw encoded bytes.
 
  Currently supported image formats are jpeg, png, gif and webp.
 
@@ -303,10 +300,9 @@ def decode_image(
  input (Tensor or str or ``pathlib.Path``): The image to decode. If a
  tensor is passed, it must be one dimensional uint8 tensor containing
  the raw bytes of the image. Otherwise, this must be a path to the image file.
- mode (str or ImageReadMode): the read mode used for optionally converting the image.
- Default: ``ImageReadMode.UNCHANGED``.
- See ``ImageReadMode`` class for more information on various
- available modes. Only applies to JPEG and PNG images.
+ mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+ Default is "UNCHANGED". See :class:`~torchvision.io.ImageReadMode`
+ for available modes.
  apply_exif_orientation (bool): apply EXIF orientation transformation to the output tensor.
  Only applies to JPEG and PNG images. Default: False.
 
@@ -367,9 +363,9 @@ def decode_webp(
  Args:
  input (Tensor[1]): a one dimensional contiguous uint8 tensor containing
  the raw bytes of the WEBP image.
- mode (str or ImageReadMode): The read mode used for optionally
- converting the image color space. Default: ``ImageReadMode.UNCHANGED``.
- Other supported values are ``ImageReadMode.RGB`` and ``ImageReadMode.RGB_ALPHA``.
+ mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+ Default is "UNCHANGED". See :class:`~torchvision.io.ImageReadMode`
+ for available modes.
 
  Returns:
  Decoded image (Tensor[image_channels, image_height, image_width])
@@ -398,9 +394,9 @@ def _decode_avif(
  Args:
  input (Tensor[1]): a one dimensional contiguous uint8 tensor containing
  the raw bytes of the AVIF image.
- mode (str or ImageReadMode): The read mode used for optionally
- converting the image color space. Default: ``ImageReadMode.UNCHANGED``.
- Other supported values are ``ImageReadMode.RGB`` and ``ImageReadMode.RGB_ALPHA``.
+ mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+ Default is "UNCHANGED". See :class:`~torchvision.io.ImageReadMode`
+ for available modes.
 
  Returns:
  Decoded image (Tensor[image_channels, image_height, image_width])
@@ -426,9 +422,9 @@ def _decode_heic(input: torch.Tensor, mode: ImageReadMode = ImageReadMode.UNCHAN
  Args:
  input (Tensor[1]): a one dimensional contiguous uint8 tensor containing
  the raw bytes of the HEIC image.
- mode (str or ImageReadMode): The read mode used for optionally
- converting the image color space. Default: ``ImageReadMode.UNCHANGED``.
- Other supported values are ``ImageReadMode.RGB`` and ``ImageReadMode.RGB_ALPHA``.
+ mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+ Default is "UNCHANGED". See :class:`~torchvision.io.ImageReadMode`
+ for available modes.
 
  Returns:
  Decoded image (Tensor[image_channels, image_height, image_width])
 
@@ -64,6 +64,14 @@ def write_video(
  """
  Writes a 4d tensor in [T, H, W, C] format in a video file
 
+ .. warning::
+
+ In the near future, we intend to centralize PyTorch's video decoding
+ capabilities within the `torchcodec
+ <https://github.com/pytorch/torchcodec>`_ project. We encourage you to
+ try it out and share your feedback, as the torchvision video decoders
+ will eventually be deprecated.
+
  Args:
  filename (str): path where the video will be saved
  video_array (Tensor[T, H, W, C]): tensor containing the individual frames,
@@ -243,6 +251,14 @@ def read_video(
  """
  Reads a video from a file, returning both the video frames and the audio frames
 
+ .. warning::
+
+ In the near future, we intend to centralize PyTorch's video decoding
+ capabilities within the `torchcodec
+ <https://github.com/pytorch/torchcodec>`_ project. We encourage you to
+ try it out and share your feedback, as the torchvision video decoders
+ will eventually be deprecated.
+
  Args:
  filename (str): path to the video file. If using the pyav backend, this can be whatever ``av.open`` accepts.
  start_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional):
@@ -367,6 +383,14 @@ def read_video_timestamps(filename: str, pts_unit: str = "pts") -> Tuple[List[in
  """
  List the video frames timestamps.
 
+ .. warning::
+
+ In the near future, we intend to centralize PyTorch's video decoding
+ capabilities within the `torchcodec
+ <https://github.com/pytorch/torchcodec>`_ project. We encourage you to
+ try it out and share your feedback, as the torchvision video decoders
+ will eventually be deprecated.
+
  Note that the function decodes the whole video frame-by-frame.
 
  Args: