From 18d28ea04098d811acf33559968277e4409079d5 Mon Sep 17 00:00:00 2001 From: Srinivas Lade Date: Mon, 8 Jun 2026 13:31:58 -0400 Subject: [PATCH 1/4] add droid dataset --- daft/datasets/__init__.py | 3 +- daft/datasets/droid.py | 153 +++++++++++++++++++++++++++++++++++ tests/datasets/test_droid.py | 67 +++++++++++++++ 3 files changed, 222 insertions(+), 1 deletion(-) create mode 100644 daft/datasets/droid.py create mode 100644 tests/datasets/test_droid.py diff --git a/daft/datasets/__init__.py b/daft/datasets/__init__.py index f817abae8d0..2dc2d2dad4e 100644 --- a/daft/datasets/__init__.py +++ b/daft/datasets/__init__.py @@ -1,3 +1,4 @@ from daft.datasets.common_crawl import common_crawl +from daft.datasets import droid -__all__ = ["common_crawl"] +__all__ = ["common_crawl", "droid"] diff --git a/daft/datasets/droid.py b/daft/datasets/droid.py new file mode 100644 index 00000000000..2e18a31b9a5 --- /dev/null +++ b/daft/datasets/droid.py @@ -0,0 +1,153 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import daft +from daft.api_annotations import PublicAPI +from daft.datatype import DataType +from daft.expressions import col, lit +from daft.functions import ( + file, + format, + regexp_replace, + unnest, + video_file, +) +from daft.io import GCSConfig, IOConfig + +if TYPE_CHECKING: + from daft.dataframe import DataFrame + + +_METADATA_DTYPE = DataType.struct( + { + "uuid": DataType.string, + "lab": DataType.string, + "user": DataType.string, + "user_id": DataType.string, + "date": DataType.date, + "timestamp": DataType.string, + "hdf5_path": DataType.string, + "building": DataType.string, + "scene_id": DataType.int64, + "success": DataType.bool, + "robot_serial": DataType.string, + "r2d2_version": DataType.string, + "current_task": DataType.string, + "trajectory_length": DataType.int64, + "wrist_cam_serial": DataType.string, + "ext1_cam_serial": DataType.string, + "ext2_cam_serial": DataType.string, + "wrist_cam_extrinsics": 
DataType.list(DataType.float64), + "ext1_cam_extrinsics": DataType.list(DataType.float64), + "ext2_cam_extrinsics": DataType.list(DataType.float64), + "wrist_svo_path": DataType.string, + "wrist_mp4_path": DataType.string, + "ext1_svo_path": DataType.string, + "ext1_mp4_path": DataType.string, + "ext2_svo_path": DataType.string, + "ext2_mp4_path": DataType.string, + "left_mp4_path": DataType.string, + "right_mp4_path": DataType.string, + } +) + + +@PublicAPI +def raw( + # By default, use the official public GCS bucket + path: str = "gs://gresearch/robotics/droid_raw", + io_config: IOConfig | None = None, + # TODO: Add support for stereo videos + # include_stereo: bool = False, + # TODO: Add support for SVO camera recordings +) -> DataFrame: + r"""Load the raw DROID robotics dataset as a lazy episode-level DataFrame. + + This function discovers episodes by globbing ``metadata_*.json`` files under the + provided dataset root, reads the episode metadata, and attaches lazy file references + to the per-episode trajectory HDF5 file and MP4 camera recordings. + + Each row corresponds to one DROID episode with the following layout on disk: + + episode/ + |---- metadata_.json # Episode metadata like building ID, data collector ID etc. + |---- trajectory.h5 # All low-dimensional information like action and proprioception trajectories. + |---- recordings/ + |---- MP4/ + |---- .mp4 + |---- -stereo.mp4 # Optional stereo views. + |---- SVO/ + |---- .svo # Raw ZED SVO file with encoded camera recording information (contains some additional metadata) + + Args: + path: Root path to the raw DROID dataset. Defaults to the official public + GCS release at `gs://gresearch/robotics/droid_raw`. Also supports + local paths and other remote object stores. + io_config: IO configuration for accessing remote storage. + + Returns: + A DataFrame with one row per episode. 
Metadata fields from each episode's JSON + file are unnested into top-level columns, along with: + + - `episode_dir`: path to the episode directory + - `uuid`, `building`, `success`, ...: metadata fields parsed from the metadata JSON file + - `trajectory`: lazy `daft.File` reference to the trajectory HDF5 file + - `wrist_video`: lazy `daft.VideoFile` reference to the wrist camera MP4 file + - `ext1_video`: lazy `daft.VideoFile` reference to the external camera 1 MP4 file + Often the left camera feed. + - `ext2_video`: lazy `daft.VideoFile` reference to the external camera 2 MP4 file + Often the right camera feed. + + Examples: + >>> import daft + >>> df = daft.datasets.droid.raw() # doctest: +SKIP + >>> df.select("episode_dir", "ext1_video").show() # doctest: +SKIP + """ + # Configure IO config with anonymous access to the public GCS bucket + if io_config is None: + io_config = IOConfig(gcs=GCSConfig(anonymous=True)) + + episodes = ( + daft.from_glob_path(f"{path.rstrip('/')}/**/metadata_*.json", io_config=io_config) + .select( + col("path") + .download(io_config=io_config) + .cast(DataType.string()) + .try_deserialize("json", _METADATA_DTYPE) + .alias("metadata"), + regexp_replace(col("path"), r"/metadata_[^/]+\.json$", "").alias("episode_dir"), + ) + .select(unnest(col("metadata")), "episode_dir") + ) + + # Create a file column for the trajectory HDF5 file + episodes = episodes.with_column( + "trajectory", + file(format("{}/{}", col("episode_dir"), lit("trajectory.h5")), io_config=io_config), + ) + + # Create VideoFile columns for MP4 camera recordings + episodes = ( + episodes.with_column( + "wrist_video", + video_file(format("{}/{}.mp4", col("episode_dir"), col("wrist_cam_serial")), io_config=io_config), + ) + .with_column( + "ext1_video", + video_file(format("{}/{}.mp4", col("episode_dir"), col("ext1_cam_serial")), io_config=io_config), + ) + .with_column( + "ext2_video", + video_file(format("{}/{}.mp4", col("episode_dir"), col("ext2_cam_serial")), io_config=io_config), + ) 
+ ) + + return episodes + + +# TODO: Add a custom expression to read & parse the trajectory HDF5 file + +__all__ = [ + "raw", +] diff --git a/tests/datasets/test_droid.py b/tests/datasets/test_droid.py new file mode 100644 index 00000000000..d1c4f1e5d9b --- /dev/null +++ b/tests/datasets/test_droid.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +import pytest + +import daft +from daft import DataType, MediaType +from daft.expressions import col + +pytestmark = pytest.mark.integration() + +DROID_RAW_GCS_PREFIX = "gs://gresearch/robotics/droid_raw" + + +@pytest.fixture(scope="module") +def droid_raw_df(): + return daft.datasets.droid.raw() + + +def test_droid_discovers_episodes_and_metadata(droid_raw_df) -> None: + result = droid_raw_df.select("uuid", "building", "success").limit(1).to_pydict() + + assert len(result["uuid"]) == 1 + assert isinstance(result["uuid"][0], str) and result["uuid"][0] + assert isinstance(result["building"][0], str) and result["building"][0] + assert isinstance(result["success"][0], bool) + + +def test_droid_unnests_metadata_columns(droid_raw_df) -> None: + schema = {field.name: field.dtype for field in droid_raw_df.schema()} + assert "building" in schema + assert "success" in schema + assert "metadata" not in schema + assert schema["building"] == DataType.string + assert schema["success"] == DataType.bool + + +def test_droid_adds_trajectory_and_video_file_columns(droid_raw_df) -> None: + schema = {field.name: field.dtype for field in droid_raw_df.schema()} + assert schema["trajectory"] == DataType.file() + assert schema["wrist_video"] == DataType.file(MediaType.video()) + assert schema["ext1_video"] == DataType.file(MediaType.video()) + assert schema["ext2_video"] == DataType.file(MediaType.video()) + + result = ( + droid_raw_df.select( + "episode_dir", + "wrist_cam_serial", + "ext1_cam_serial", + "ext2_cam_serial", + col("trajectory").file_path().alias("trajectory_path"), + 
col("wrist_video").file_path().alias("wrist_video_path"), + col("ext1_video").file_path().alias("ext1_video_path"), + col("ext2_video").file_path().alias("ext2_video_path"), + ) + .limit(1) + .to_pydict() + ) + + episode_dir = result["episode_dir"][0] + assert episode_dir.startswith(f"{DROID_RAW_GCS_PREFIX}/") + + trajectory_path = result["trajectory_path"][0] + assert trajectory_path == f"{episode_dir}/trajectory.h5" + + assert result["wrist_video_path"][0] == f"{episode_dir}/{result['wrist_cam_serial'][0]}.mp4" + assert result["ext1_video_path"][0] == f"{episode_dir}/{result['ext1_cam_serial'][0]}.mp4" + assert result["ext2_video_path"][0] == f"{episode_dir}/{result['ext2_cam_serial'][0]}.mp4" From c4eccd033eee3f62a2f46e7d840d7b9f83f814bb Mon Sep 17 00:00:00 2001 From: Srinivas Lade Date: Wed, 17 Jun 2026 11:15:45 -0700 Subject: [PATCH 2/4] add docs --- docs/SUMMARY.md | 1 + docs/api/datasets.md | 11 +++- docs/api/index.md | 2 +- docs/datasets/droid.md | 140 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 docs/datasets/droid.md diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 248663db1e2..7de3bf59023 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -31,6 +31,7 @@ * [Batch Inference](use-case/batch-inference.md) * Datasets * [Common Crawl](datasets/common-crawl.md) + * [DROID](datasets/droid.md) * Data Connectors * [Overview](connectors/index.md) * Object Storage diff --git a/docs/api/datasets.md b/docs/api/datasets.md index 57cc969eb85..be818571a7f 100644 --- a/docs/api/datasets.md +++ b/docs/api/datasets.md @@ -1,6 +1,6 @@ # Datasets -Daft provides simple, performant, and responsible ways to access useful datasets like [Common Crawl](https://commoncrawl.org/get-started). +Daft provides simple, performant, and responsible ways to access useful datasets like [Common Crawl](https://commoncrawl.org/get-started) and [DROID](https://droid-dataset.github.io/). 
## Common Crawl @@ -10,3 +10,12 @@ Check out our [Common Crawl dataset guide](../datasets/common-crawl.md) for more options: filters: ["!^_"] heading_level: 3 + +## DROID + +Check out our [DROID dataset guide](../datasets/droid.md) for more examples! + +::: daft.datasets.droid.raw + options: + filters: ["!^_"] + heading_level: 3 diff --git a/docs/api/index.md b/docs/api/index.md index da73d616880..f3c3dd79326 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -19,7 +19,7 @@ Welcome to Daft Python API Documentation. For Daft User Guide, head to [User Gui - [**Datasets**](datasets.md) - Simple, performant, and responsible ways to access useful datasets like Common Crawl. + Simple, performant, and responsible ways to access useful datasets like Common Crawl and DROID. - [**Expressions**](expressions.md) diff --git a/docs/datasets/droid.md b/docs/datasets/droid.md new file mode 100644 index 00000000000..14fce7e02fd --- /dev/null +++ b/docs/datasets/droid.md @@ -0,0 +1,140 @@ +# How to use DROID with Daft + +[DROID](https://droid-dataset.github.io/) (Distributed Robot Interaction Dataset) is one of the most popular large-scale, in-the-wild robot manipulation datasets, with 76,000 demonstration trajectories and 350 hours of interaction data. It was collected across 564 scenes and 86 tasks using the Franka Panda robot platform, and includes synchronized RGB camera streams, camera calibration, and natural language task descriptions. + +Daft provides a simple way to explore the raw DROID release as a lazy, episode-level DataFrame with metadata, trajectory files, and camera videos attached as [`daft.VideoFile`](../modalities/videos.md) columns. + +## Prerequisites + +The raw DROID dataset is hosted on Google Cloud Storage at `gs://gresearch/robotics/droid_raw` (~8.7 TB). By default, [`daft.datasets.droid.raw()`][daft.datasets.droid.raw] reads from this public bucket, so no credentials are required to get started. 
+ +If you prefer to work with a local copy, download episodes with [`gsutil`](https://cloud.google.com/storage/docs/gsutil_install) and pass the local path to `raw()`: + +```bash +# Download the full raw dataset (~8.7 TB) +gsutil -m cp -r gs://gresearch/robotics/droid_raw /path/to/droid_raw + +# Or download a smaller subset for development +gsutil -m cp -r gs://gresearch/robotics/droid_raw/<subdirectory> /path/to/droid_raw/ +``` + +See the [official DROID dataset documentation](https://droid-dataset.github.io/droid/the-droid-dataset) for details on the dataset format and downloading the necessary files for your use case. + +## Quickstart + +The simplest way to get started is to load a small sample of data from the public source: + +```python +import daft + +# Load a sample of the raw DROID data +daft.datasets.droid.raw().show() +``` + +Each row corresponds to one DROID episode. Metadata from each episode's JSON file is unnested into top-level columns, and lazy file references are attached for the trajectory HDF5 file and three MP4 camera recordings. + +## Basic usage + +### Loading from the public GCS bucket + +This is the default behavior. Daft globs `metadata_*.json` files under the dataset root, reads each episode's metadata, and constructs paths to the associated trajectory and video files: + +```python +import daft + +df = daft.datasets.droid.raw() +``` + +### Loading from a local or custom path + +Point `path` at any directory that mirrors the raw DROID layout (see [Episode layout](#episode-layout) below): + +```python +import daft + +df = daft.datasets.droid.raw(path="/path/to/droid_raw") +``` + +Remote object stores other than GCS are also supported when passed via `path` with an appropriate [`IOConfig`][daft.io.IOConfig]. 
+ +### Loading a subset of episodes + +Because `raw()` returns a lazy DataFrame, you can filter, project, and sample before materializing any video or trajectory data: + +```python +import daft + +( + daft.datasets.droid.raw() + .where(daft.col("success")) + .where(daft.col("building") == "Ross") + .select("uuid", "current_task", "trajectory_length", "wrist_video") + .limit(10) +) +``` + +## Episode layout + +Each DROID episode is stored in its own directory: + +``` +episode/ +|---- metadata_<episode_id>.json # Episode metadata (building, task, camera serials, etc.) +|---- trajectory.h5 # Low-dimensional action and proprioception trajectories +|---- recordings/ + |---- MP4/ + |---- <camera_serial>.mp4 + |---- <camera_serial>-stereo.mp4 # Optional stereo views + |---- SVO/ + |---- <camera_serial>.svo # Raw ZED SVO recordings +``` + +[`daft.datasets.droid.raw()`][daft.datasets.droid.raw] currently attaches lazy references to: + +- `trajectory`: the episode's `trajectory.h5` file +- `wrist_video`: wrist camera MP4 +- `ext1_video`: external camera 1 MP4 (often the left view) +- `ext2_video`: external camera 2 MP4 (often the right view) + +Stereo MP4 and raw SVO recordings are not yet exposed as columns. 
+ +## Data schema + +`raw()` returns one row per episode with metadata fields unnested from each `metadata_*.json` file, plus the following key columns: + +| Column | Type | Description | +| --- | --- | --- | +| `episode_dir` | String | Path to the episode directory | +| `uuid` | String | Unique episode identifier | +| `lab` | String | Collecting lab | +| `user` | String | Data collector name | +| `user_id` | String | Data collector identifier | +| `date` | Date | Collection date | +| `timestamp` | String | Collection timestamp | +| `building` | String | Building or environment name | +| `scene_id` | Int64 | Scene identifier within the building | +| `success` | Boolean | Whether the demonstration was successful | +| `current_task` | String | Natural language task description | +| `trajectory_length` | Int64 | Number of timesteps in the trajectory | +| `robot_serial` | String | Robot hardware serial number | +| `wrist_cam_serial` | String | Wrist camera serial number | +| `ext1_cam_serial` | String | External camera 1 serial number | +| `ext2_cam_serial` | String | External camera 2 serial number | +| `wrist_cam_extrinsics` | List[Float64] | Wrist camera extrinsics | +| `ext1_cam_extrinsics` | List[Float64] | External camera 1 extrinsics | +| `ext2_cam_extrinsics` | List[Float64] | External camera 2 extrinsics | +| `trajectory` | File | Lazy reference to `trajectory.h5` | +| `wrist_video` | VideoFile | Lazy reference to the wrist camera MP4 | +| `ext1_video` | VideoFile | Lazy reference to external camera 1 MP4 | +| `ext2_video` | VideoFile | Lazy reference to external camera 2 MP4 | + +Additional path columns from the metadata JSON (such as `hdf5_path`, `wrist_mp4_path`, and `ext1_mp4_path`) are also available as top-level columns. + + + +## Next steps + +- See the [Videos modality guide](../modalities/videos.md) for decoding frames with [`video_frames`][daft.functions.video_frames] and working with [`daft.VideoFile`](../api/datatypes/file_types.md). 
+- See the [Files modality guide](../modalities/files.md) for reading trajectory HDF5 files with [`daft.File`](../api/datatypes/file_types.md). +- Visit the [official DROID project page](https://droid-dataset.github.io/) for hardware setup, policy learning code, and additional dataset formats. +- See the [DROID Dataset API reference](../api/datasets.md#droid) for complete parameter documentation. From c06e38ad5daf86a5106c70ce24647c55216b0ab8 Mon Sep 17 00:00:00 2001 From: Srinivas Lade Date: Wed, 17 Jun 2026 12:32:10 -0700 Subject: [PATCH 3/4] clean up a bit --- daft/datasets/droid.py | 33 +++++++++++++++++++++++++-------- daft/functions/file_.py | 2 +- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/daft/datasets/droid.py b/daft/datasets/droid.py index 2e18a31b9a5..cc507d078d9 100644 --- a/daft/datasets/droid.py +++ b/daft/datasets/droid.py @@ -5,7 +5,7 @@ import daft from daft.api_annotations import PublicAPI from daft.datatype import DataType -from daft.expressions import col, lit +from daft.expressions import col from daft.functions import ( file, format, @@ -19,6 +19,8 @@ from daft.dataframe import DataFrame +_PUBLIC_GCS_BUCKET = "gs://gresearch/robotics/droid_raw" + _METADATA_DTYPE = DataType.struct( { "uuid": DataType.string, @@ -56,8 +58,10 @@ @PublicAPI def raw( # By default, use the official public GCS bucket - path: str = "gs://gresearch/robotics/droid_raw", + path: str = _PUBLIC_GCS_BUCKET, io_config: IOConfig | None = None, + *, + verify_videos: bool = True, # TODO: Add support for stereo videos # include_stereo: bool = False, # TODO: Add support for SVO camera recordings @@ -85,6 +89,7 @@ def raw( GCS release at `gs://gresearch/robotics/droid_raw`. Also supports local paths and other remote object stores. io_config: IO configuration for accessing remote storage. + verify_videos: Whether to verify that the video files exist and are valid. Defaults to True. Returns: A DataFrame with one row per episode. 
Metadata fields from each episode's JSON @@ -105,7 +110,7 @@ def raw( >>> df.select("episode_dir", "ext1_video").show() # doctest: +SKIP """ # Configure IO config with anonymous access to the public GCS bucket - if io_config is None: + if io_config is None and path == _PUBLIC_GCS_BUCKET: io_config = IOConfig(gcs=GCSConfig(anonymous=True)) episodes = ( @@ -113,7 +118,7 @@ def raw( .select( col("path") .download(io_config=io_config) - .cast(DataType.string()) + .cast(DataType.string) .try_deserialize("json", _METADATA_DTYPE) .alias("metadata"), regexp_replace(col("path"), r"/metadata_[^/]+\.json$", "").alias("episode_dir"), @@ -124,22 +129,34 @@ def raw( # Create a file column for the trajectory HDF5 file episodes = episodes.with_column( "trajectory", - file(format("{}/{}", col("episode_dir"), lit("trajectory.h5")), io_config=io_config), + file(format("{}/trajectory.h5", col("episode_dir")), io_config=io_config), ) # Create VideoFile columns for MP4 camera recordings episodes = ( episodes.with_column( "wrist_video", - video_file(format("{}/{}.mp4", col("episode_dir"), col("wrist_cam_serial")), io_config=io_config), + video_file( + format("{}/recordings/MP4/{}.mp4", col("episode_dir"), col("wrist_cam_serial")), + io_config=io_config, + verify=verify_videos, + ), ) .with_column( "ext1_video", - video_file(format("{}/{}.mp4", col("episode_dir"), col("ext1_cam_serial")), io_config=io_config), + video_file( + format("{}/recordings/MP4/{}.mp4", col("episode_dir"), col("ext1_cam_serial")), + io_config=io_config, + verify=verify_videos, + ), ) .with_column( "ext2_video", - video_file(format("{}/{}.mp4", col("episode_dir"), col("ext2_cam_serial")), io_config=io_config), + video_file( + format("{}/recordings/MP4/{}.mp4", col("episode_dir"), col("ext2_cam_serial")), + io_config=io_config, + verify=verify_videos, + ), ) ) diff --git a/daft/functions/file_.py b/daft/functions/file_.py index 87294b881d4..cf578c30d70 100644 --- a/daft/functions/file_.py +++ 
b/daft/functions/file_.py @@ -13,7 +13,7 @@ def file(url: Expression, io_config: IOConfig | None = None) -> Expression: """Converts a string containing a file reference to a `daft.File` reference. Args: - url (StringExpression): the url of the file + url (String Expression): the url of the file io_config (IOConfig, default=None): The IO configuration to use. Returns: From 6a5587a0990238063dd7e0e48298f9271c105523 Mon Sep 17 00:00:00 2001 From: Srinivas Lade Date: Fri, 19 Jun 2026 13:38:19 -0700 Subject: [PATCH 4/4] clean up --- daft/datasets/droid.py | 51 +++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/daft/datasets/droid.py b/daft/datasets/droid.py index cc507d078d9..0be82c89074 100644 --- a/daft/datasets/droid.py +++ b/daft/datasets/droid.py @@ -5,13 +5,15 @@ import daft from daft.api_annotations import PublicAPI from daft.datatype import DataType -from daft.expressions import col +from daft.expressions import col, lit from daft.functions import ( file, + file_exists, format, regexp_replace, unnest, video_file, + when, ) from daft.io import GCSConfig, IOConfig @@ -60,8 +62,6 @@ def raw( # By default, use the official public GCS bucket path: str = _PUBLIC_GCS_BUCKET, io_config: IOConfig | None = None, - *, - verify_videos: bool = True, # TODO: Add support for stereo videos # include_stereo: bool = False, # TODO: Add support for SVO camera recordings @@ -72,17 +72,9 @@ def raw( provided dataset root, reads the episode metadata, and attaches lazy file references to the per-episode trajectory HDF5 file and MP4 camera recordings. - Each row corresponds to one DROID episode with the following layout on disk: - - episode/ - |---- metadata_.json # Episode metadata like building ID, data collector ID etc. - |---- trajectory.h5 # All low-dimensional information like action and proprioception trajectories. - |---- recordings/ - |---- MP4/ - |---- .mp4 - |---- -stereo.mp4 # Optional stereo views. 
- |---- SVO/ - |---- .svo # Raw ZED SVO file with encoded camera recording information (contains some additional metadata) + Note: + The public dataset is missing camera recordings for some episodes. Those that are missing + will be set to `None`. Args: path: Root path to the raw DROID dataset. Defaults to the official public @@ -130,34 +122,33 @@ def raw( episodes = episodes.with_column( "trajectory", file(format("{}/trajectory.h5", col("episode_dir")), io_config=io_config), + ).with_column( + "trajectory", + when(file_exists(col("trajectory")), col("trajectory")).otherwise(lit(None)), ) # Create VideoFile columns for MP4 camera recordings - episodes = ( - episodes.with_column( - "wrist_video", - video_file( + episodes = episodes.with_columns( + { + "wrist_video": video_file( format("{}/recordings/MP4/{}.mp4", col("episode_dir"), col("wrist_cam_serial")), io_config=io_config, - verify=verify_videos, ), - ) - .with_column( - "ext1_video", - video_file( + "ext1_video": video_file( format("{}/recordings/MP4/{}.mp4", col("episode_dir"), col("ext1_cam_serial")), io_config=io_config, - verify=verify_videos, ), - ) - .with_column( - "ext2_video", - video_file( + "ext2_video": video_file( format("{}/recordings/MP4/{}.mp4", col("episode_dir"), col("ext2_cam_serial")), io_config=io_config, - verify=verify_videos, ), - ) + } + ).with_columns( + { + "wrist_video": when(file_exists(col("wrist_video")), col("wrist_video")).otherwise(lit(None)), + "ext1_video": when(file_exists(col("ext1_video")), col("ext1_video")).otherwise(lit(None)), + "ext2_video": when(file_exists(col("ext2_video")), col("ext2_video")).otherwise(lit(None)), + } ) return episodes