Eventual-Inc · lhoestq · Jun 10, 2026 · Jun 10, 2026
diff --git a/daft/datasets/common_crawl.py b/daft/datasets/common_crawl.py
@@ -24,6 +24,10 @@ def _get_http_manifest_path(crawl: str, file_type: Literal["warc", "wet", "wat"]
     return f"https://data.commoncrawl.org/crawl-data/{crawl}/{file_type}.paths.gz"
 
 
+def _get_hf_manifest_path(crawl: str, file_type: Literal["warc", "wet", "wat"]) -> str:
+    return f"hf://buckets/commoncrawl/commoncrawl/crawl-data/{crawl}/{file_type}.paths.gz"  # gzipped manifest; same relative path as the HTTPS manifest URL
+
+
 def _unique_cc_file_paths(paths_url: str, io_config: IOConfig | None) -> DataFrame:
     # The manifest file is a gzipped plaintext file with one path per line.
     # Technically, this is equivalent to a CSV file with one column, "url", with no headers, and we could use read_csv.
@@ -47,16 +51,21 @@ def _get_common_crawl_paths(
     io_config: IOConfig | None,
     *,
     in_aws: bool,
+    hf_buckets: bool = False,
 ) -> list[str]:
     """Get the paths to the Common Crawl files for a given crawl, segment, file type. Limited by `num_files`."""
-    if in_aws:
+    if hf_buckets:
+        paths_url = _get_hf_manifest_path(crawl, file_type)
+    elif in_aws:
         paths_url = _get_s3_manifest_path(crawl, file_type)
     else:
         paths_url = _get_http_manifest_path(crawl, file_type)
 
     paths = _unique_cc_file_paths(paths_url, io_config)
 
-    if in_aws:
+    if hf_buckets:
+        paths = paths.select(format("hf://buckets/commoncrawl/commoncrawl/{}", col("url")).alias("url"))
+    elif in_aws:
         paths = paths.select(format("s3://commoncrawl/{}", col("url")).alias("url"))
     else:
         paths = paths.select(format("https://data.commoncrawl.org/{}", col("url")).alias("url"))
@@ -79,7 +88,8 @@ def common_crawl(
     num_files: int | None = None,
     io_config: IOConfig | None = None,
     *,
-    in_aws: bool,
+    in_aws: bool = False,
+    hf_buckets: bool = False,
 ) -> DataFrame:
     r"""Load Common Crawl data as a DataFrame.
 
@@ -94,11 +104,18 @@ def common_crawl(
             + "text" or "wet": Extracted plain text content
             + "metadata" or "wat": Metadata about crawled pages
         num_files: Limit the number of files to process. If not provided, processes all matching files.
-        io_config: IO configuration for accessing S3.
-        in_aws: Where to fetch the common crawl data from. If running in AWS, this must be set to True. If outside of AWS,
-                then this must be set to False. Setting this flag correctly is required for **optimal download performance**.
-                If running in AWS, then make sure you're in the "us-east-1" region so you don't incur S3 egress fees!
-                See [the Common Crawl docs](https://commoncrawl.org/get-started) for more specific instructions.
+        io_config: IO configuration for accessing storage.
+        in_aws: Fetch from AWS S3 (``s3://commoncrawl/...``). If running in AWS, set to ``True`` for optimal
+                performance. Set to ``False`` when running outside AWS to avoid S3 egress fees.
+                If running in AWS, make sure you're in the "us-east-1" region.
+        hf_buckets: Fetch from Hugging Face Buckets (``hf://buckets/commoncrawl/...``). This is the recommended
+                option for most users, especially when running outside AWS. HF Buckets are accessible from any cloud
+                provider and region, and are cheaper than S3 egress. See the [Hugging Face Buckets docs](https://huggingface.co/docs/huggingface_hub/guides/hf_file_system#hugging-face-buckets) for more details.
+
+    Note:
+        Only one of ``in_aws`` or ``hf_buckets`` should be ``True`` at a time; if both are ``True``, ``hf_buckets``
+        takes precedence. If both are ``False``, HTTPS is used as a fallback (slower but requires no credentials).
+        See [the Common Crawl docs](https://commoncrawl.org/get-started) for more information.
 
     Returns:
         A DataFrame containing the requested Common Crawl data.
@@ -179,6 +196,7 @@ def common_crawl(
         num_files=num_files,
         io_config=io_config,
         in_aws=in_aws,
+        hf_buckets=hf_buckets,
     )
 
     return read_warc(warc_paths, io_config=io_config)
diff --git a/daft/filesystem.py b/daft/filesystem.py
@@ -271,6 +271,21 @@ def _set_if_not_none(kwargs: dict[str, Any], key: str, val: Any | None) -> None:
             fsspec_fs = fsspec_fs_cls()
         return pafs.PyFileSystem(pafs.FSSpecHandler(fsspec_fs)), None
 
+    ###
+    # HF (Hugging Face): Use FSSpec as a fallback
+    ###
+    if protocol == "hf":
+        fsspec_fs_cls = fsspec.get_filesystem_class("hf")  # resolves the fsspec implementation registered for "hf"
+        hf_kwargs: dict[str, Any] = {}  # stays empty without an HF config, so the filesystem uses its own defaults
+        if io_config is not None and io_config.hf is not None:
+            hf_config = io_config.hf
+            if hf_config.token is not None:
+                hf_kwargs["token"] = hf_config.token.value
+            if hf_config.anonymous:  # checked last so anonymous mode overrides any configured token
+                hf_kwargs["token"] = False  # False disables auth; None would fall back to the locally saved token
+        fsspec_fs = fsspec_fs_cls(**hf_kwargs)
+        return pafs.PyFileSystem(pafs.FSSpecHandler(fsspec_fs)), None
+
     ###
     # Gravitino GVFS: Use custom filesystem for write operations
     ###

diff --git a/docs/datasets/common-crawl.md b/docs/datasets/common-crawl.md
@@ -8,6 +8,90 @@ Daft provides a simple, performant, and responsible way to access Common Crawl d
 
     These APIs are in beta and may be subject to change as the Common Crawl dataset continues to be developed.
 
+## Access from Hugging Face Buckets
+
+As of 2025, Common Crawl data is also hosted on [Hugging Face Buckets](https://huggingface.co/buckets/commoncrawl), providing a cheaper and more accessible alternative to AWS S3. Unlike S3, which requires AWS credentials and is best accessed from within the `us-east-1` region (inter-region requests incur data transfer fees), HF Buckets are accessible from any cloud provider and region using the ``hf://`` protocol.
+
+When using `daft.datasets.common_crawl`, set ``hf_buckets=True`` to fetch data from HF Buckets. This is the **recommended** option for most users, especially when running outside AWS.
+
+### Reading with the Common Crawl dataset helper
+
+```python
+import daft
+
+# Read from HF Buckets (recommended)
+daft.datasets.common_crawl("CC-MAIN-2025-33", hf_buckets=True)
+```
+
+### Reading WARC files directly
+
+You can also read WARC files directly from HF Buckets using the ``hf://`` protocol:
+
+```python
+import daft
+
+# Read a single WARC file from HF Buckets
+daft.read_warc("hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2026-17/segments/1775805908305.14/warc/CC-MAIN-20260410081153-20260410111153-00000.warc.gz")
+
+# Read WARC files matching a glob pattern
+daft.read_warc("hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2026-17/segments/*/warc/*.warc.gz")
+```
+
+### Authentication
+
+For public Common Crawl data, no authentication is needed. For private Hugging Face repos or datasets, pass a token via ``IOConfig``:
+
+```python
+import daft
+
+daft.read_warc(
+    "hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2025-33/segments/...",
+    io_config=daft.io.IOConfig(hf=daft.io.HuggingFaceConfig(token="hf_xxxxxxxxxxxxxxxxxxxx"))
+)
+```
+
+You can also authenticate using ``hf auth login`` — the token is picked up automatically.
+
+### Exploring HF Buckets
+
+Use the [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub) library to explore available data:
+
+```python
+from huggingface_hub import hffs
+
+# List crawl archives
+for path in hffs.ls('hf://buckets/commoncrawl/commoncrawl/crawl-data/'):
+    print(path)
+
+# List segments in a specific crawl
+for path in hffs.ls('hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2026-17/'):
+    print(path)
+
+# List WARC files in a segment
+for path in hffs.ls('hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2026-17/segments/1775805908305.14/warc/'):
+    print(path)
+```
+
+**Note:** When using ``hffs.ls()``, paths are returned without the ``hf://`` prefix. To open files returned by ``ls()``, prepend the protocol:
+
+```python
+files = hffs.ls('hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2026-17/segments/1775805908305.14/warc/')
+for file_path in files[:1]:
+    with daft.file.open_file('hf://' + file_path, 'rb') as fh:
+        # ... read the WARC file
+        ...
+```
+
+### Access Method Comparison
+
+| Method | URL Scheme | Best For | Credentials Required |
+|--------|-----------|----------|---------------------|
+| Hugging Face Buckets | ``hf://buckets/...`` | Cross-region, outside AWS, cheapest | No (public) |
+| AWS S3 | ``s3://commoncrawl/...`` | Inside AWS (us-east-1) | Yes |
+| HTTPS | ``https://data.commoncrawl.org/...`` | Fallback when no credentials | No |
+
+Common Crawl path structure on HF Buckets: ``hf://buckets/commoncrawl/commoncrawl/crawl-data/<crawl-name>/<path>``
+
 ## Prerequisites for access within the AWS Cloud
 
 Common Crawl data is hosted by [Amazon Web Services' Open Data Sets Sponsorships program](https://aws.amazon.com/opendata/) which makes it freely accessible.
@@ -19,6 +103,10 @@ All Common Crawl data is stored in the `us-east-1` region. It's recommended to a
 
 > The connection to S3 should be faster and you avoid the minimal fees for inter-region data transfer (you have to send requests which are charged as outgoing traffic).
 
+!!! warning
+
+    Using S3 from outside AWS incurs data transfer egress fees. Consider using [Hugging Face Buckets](#access-from-hugging-face-buckets) as a cheaper alternative.
+
 ### Authentication option 1: AWS SSO Login
 
 ```bash
@@ -49,18 +137,14 @@ daft.datasets.common_crawl("CC-MAIN-2025-33", io_config=io_config, in_aws=True)
 
 ## Prerequisites for access outside the AWS Cloud
 
-If you are running _outside_ of AWS, then the most optimal way to download Common Crawl data is to use their HTTPS links.
-From the [Common Crawl website](https://commoncrawl.org/get-started):
-
-> If you want to download the data to your local machine or local cluster, you can use any HTTP download agent, such as cURL or wget.
-
-**NOTE**: When using `daft.datasets.common_crawl`, you _must_ provide `in_aws=False` when accessing data outside the AWS Cloud!
+If you are running _outside_ of AWS, we recommend using [Hugging Face Buckets](#access-from-hugging-face-buckets) (`hf_buckets=True`) as the optimal access method.
 
-Here's an example of how to use Common Crawl with Daft when outside of AWS:
+As a fallback, you can also use HTTPS links (slower but no credentials required):
 
 ```python
 import daft
 
+# Use HTTPS as a fallback (slower than HF Buckets)
 daft.datasets.common_crawl("CC-MAIN-2025-33", in_aws=False)
 ```
 
@@ -71,14 +155,8 @@ The simplest way to get started with Common Crawl is to load a small sample of d
 ```python
 import daft
 
-# If you are running this code locally, set `in_aws = True`. This will use S3.
-# Otherwise, set `in_aws = False`. This will use HTTPS URLs for the files.
-# You must **explicitly** set the `in_aws` parameter.
-in_aws: bool = ...
-
-
-# Load a sample of raw WARC data from the CC-MAIN-2025-33 crawl
-daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, in_aws=in_aws).show()
+# Read from Hugging Face Buckets (recommended - no credentials needed, accessible from anywhere)
+daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, hf_buckets=True).show()
 ```
 
 ```{title="Output"}
@@ -117,27 +195,27 @@ Common Crawl provides three types of content:
 
 ```python
 # Raw WARC data (default)
-daft.datasets.common_crawl("CC-MAIN-2025-33", content="raw", in_aws=in_aws)
+daft.datasets.common_crawl("CC-MAIN-2025-33", content="raw", hf_buckets=True)
 # or equivalently
-daft.datasets.common_crawl("CC-MAIN-2025-33", content="warc", in_aws=in_aws)
+daft.datasets.common_crawl("CC-MAIN-2025-33", content="warc", hf_buckets=True)
 ```
 
 **Extracted text, aka WET files** - Plain text content extracted from web pages:
 
 ```python
 # Extracted text content
-daft.datasets.common_crawl("CC-MAIN-2025-33", content="text", in_aws=in_aws)
+daft.datasets.common_crawl("CC-MAIN-2025-33", content="text", hf_buckets=True)
 # or equivalently
-daft.datasets.common_crawl("CC-MAIN-2025-33", content="wet", in_aws=in_aws)
+daft.datasets.common_crawl("CC-MAIN-2025-33", content="wet", hf_buckets=True)
 ```
 
 **Metadata, aka WAT files** - Information about crawled pages without content:
 
 ```python
 # Metadata only
-daft.datasets.common_crawl("CC-MAIN-2025-33", content="metadata", in_aws=in_aws)
+daft.datasets.common_crawl("CC-MAIN-2025-33", content="metadata", hf_buckets=True)
 # or equivalently
-daft.datasets.common_crawl("CC-MAIN-2025-33", content="wat", in_aws=in_aws)
+daft.datasets.common_crawl("CC-MAIN-2025-33", content="wat", hf_buckets=True)
 ```
 
 ### Loading a subset of data
@@ -146,7 +224,7 @@ For quick testing and development, it's helpful to limit the number of crawl fil
 
 ```python
 # Process only 1 crawl file for testing
-daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, in_aws=in_aws)
+daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, hf_buckets=True)
 ```
 
 ### Working with specific segments
@@ -157,7 +235,7 @@ Each crawl is split into 100 segments. You can target a specific segment:
 daft.datasets.common_crawl(
     "CC-MAIN-2025-33",
     segment="1754151279521.11",
-    in_aws=in_aws,
+    hf_buckets=True,
 )
 ```
 
@@ -185,7 +263,7 @@ Find the most common MIME types in a crawl:
 
 ```python
 (
-    daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, in_aws=in_aws)
+    daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, hf_buckets=True)
     .select(daft.col("WARC-Identified-Payload-Type"))
     .groupby("WARC-Identified-Payload-Type")
     .agg(daft.col("WARC-Identified-Payload-Type").count().alias("count"))
@@ -226,7 +304,7 @@ Content in Common Crawl WARC files are UTF-8 encoded. Use Daft's [try_decode][da
 from daft.functions import try_decode
 
 (
-    daft.datasets.common_crawl("CC-MAIN-2025-33", content="text", num_files=1, in_aws=in_aws)
+    daft.datasets.common_crawl("CC-MAIN-2025-33", content="text", num_files=1, hf_buckets=True)
     .with_column("text_content", try_decode(daft.col("warc_content"), charset="utf-8"))
     .where(daft.col("text_content").not_null())
     .select("WARC-Target-URI", "text_content")