Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 26 additions & 8 deletions daft/datasets/common_crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ def _get_http_manifest_path(crawl: str, file_type: Literal["warc", "wet", "wat"]
return f"https://data.commoncrawl.org/crawl-data/{crawl}/{file_type}.paths.gz"


def _get_hf_manifest_path(crawl: str, file_type: Literal["warc", "wet", "wat"]) -> str:
return f"hf://buckets/commoncrawl/commoncrawl/crawl-data/{crawl}/{file_type}.paths.gz"


def _unique_cc_file_paths(paths_url: str, io_config: IOConfig | None) -> DataFrame:
# The manifest file is a gzipped plaintext file with one path per line.
# Technically, this is equivalent to a CSV file with one column, "url", with no headers, and we could use read_csv.
Expand All @@ -47,16 +51,21 @@ def _get_common_crawl_paths(
io_config: IOConfig | None,
*,
in_aws: bool,
hf_buckets: bool = False,
) -> list[str]:
"""Get the paths to the Common Crawl files for a given crawl, segment, file type. Limited by `num_files`."""
if in_aws:
if hf_buckets:
paths_url = _get_hf_manifest_path(crawl, file_type)
elif in_aws:
paths_url = _get_s3_manifest_path(crawl, file_type)
else:
paths_url = _get_http_manifest_path(crawl, file_type)

paths = _unique_cc_file_paths(paths_url, io_config)

if in_aws:
if hf_buckets:
paths = paths.select(format("hf://buckets/commoncrawl/commoncrawl/{}", col("url")).alias("url"))
elif in_aws:
paths = paths.select(format("s3://commoncrawl/{}", col("url")).alias("url"))
else:
paths = paths.select(format("https://data.commoncrawl.org/{}", col("url")).alias("url"))
Expand All @@ -79,7 +88,8 @@ def common_crawl(
num_files: int | None = None,
io_config: IOConfig | None = None,
*,
in_aws: bool,
in_aws: bool = False,
hf_buckets: bool = False,
) -> DataFrame:
r"""Load Common Crawl data as a DataFrame.

Expand All @@ -94,11 +104,18 @@ def common_crawl(
+ "text" or "wet": Extracted plain text content
+ "metadata" or "wat": Metadata about crawled pages
num_files: Limit the number of files to process. If not provided, processes all matching files.
io_config: IO configuration for accessing S3.
in_aws: Where to fetch the common crawl data from. If running in AWS, this must be set to True. If outside of AWS,
then this must be set to False. Setting this flag correctly is required for **optimal download performance**.
If running in AWS, then make sure you're in the "us-east-1" region so you don't incur S3 egress fees!
See [the Common Crawl docs](https://commoncrawl.org/get-started) for more specific instructions.
io_config: IO configuration for accessing storage.
in_aws: Fetch from AWS S3 (``s3://commoncrawl/...``). If running in AWS, set to ``True`` for optimal
performance. Set to ``False`` when running outside AWS to avoid S3 egress fees.
If running in AWS, make sure you're in the "us-east-1" region.
hf_buckets: Fetch from Hugging Face Buckets (``hf://buckets/commoncrawl/...``). This is the recommended
option for most users, especially when running outside AWS. HF Buckets are accessible from any cloud
provider and region, and are cheaper than S3 egress. See the [Hugging Face Buckets docs](https://huggingface.co/docs/huggingface_hub/guides/hf_file_system#hugging-face-buckets) for more details.

Note:
Only one of ``in_aws`` or ``hf_buckets`` should be ``True`` at a time. If both are ``True``,
``hf_buckets`` takes precedence. If both are ``False``, HTTPS is used as a fallback
(slower but requires no credentials).
See [the Common Crawl docs](https://commoncrawl.org/get-started) for more information.

Returns:
A DataFrame containing the requested Common Crawl data.
Expand Down Expand Up @@ -179,6 +196,7 @@ def common_crawl(
num_files=num_files,
io_config=io_config,
in_aws=in_aws,
hf_buckets=hf_buckets,
)

return read_warc(warc_paths, io_config=io_config)
15 changes: 15 additions & 0 deletions daft/filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,21 @@ def _set_if_not_none(kwargs: dict[str, Any], key: str, val: Any | None) -> None:
fsspec_fs = fsspec_fs_cls()
return pafs.PyFileSystem(pafs.FSSpecHandler(fsspec_fs)), None

###
# HF (Hugging Face): Use FSSpec as a fallback
###
if protocol == "hf":
fsspec_fs_cls = fsspec.get_filesystem_class("hf")
hf_kwargs: dict[str, Any] = {}
if io_config is not None and io_config.hf is not None:
hf_config = io_config.hf
if hf_config.token is not None:
hf_kwargs["token"] = hf_config.token.value
if hf_config.anonymous:
hf_kwargs["token"] = None
fsspec_fs = fsspec_fs_cls(**hf_kwargs)
return pafs.PyFileSystem(pafs.FSSpecHandler(fsspec_fs)), None

###
# Gravitino GVFS: Use custom filesystem for write operations
###
Expand Down
128 changes: 103 additions & 25 deletions docs/datasets/common-crawl.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,90 @@ Daft provides a simple, performant, and responsible way to access Common Crawl d

These APIs are in beta and may be subject to change as the Common Crawl dataset continues to be developed.

## Access from Hugging Face Buckets

As of 2025, Common Crawl data is also hosted on [Hugging Face Buckets](https://huggingface.co/buckets/commoncrawl), providing a cheaper and more accessible alternative to AWS S3. Unlike S3, which restricts access to AWS machines in the same region (and charges inter-region egress fees), HF Buckets are accessible from any cloud provider and region using the ``hf://`` protocol.

When using `daft.datasets.common_crawl`, set ``hf_buckets=True`` to fetch data from HF Buckets. This is the **recommended** option for most users, especially when running outside AWS.

### Reading with the Common Crawl dataset helper

```python
import daft

# Read from HF Buckets (recommended)
daft.datasets.common_crawl("CC-MAIN-2025-33", hf_buckets=True)
```

### Reading WARC files directly

You can also read WARC files directly from HF Buckets using the ``hf://`` protocol:

```python
import daft

# Read a single WARC file from HF Buckets
daft.read_warc("hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2026-17/segments/1775805908305.14/warc/CC-MAIN-20260410081153-20260410111153-00000.warc.gz")

# Read WARC files matching a glob pattern
daft.read_warc("hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2026-17/segments/*/warc/*.warc.gz")
```

### Authentication

For public Common Crawl data, no authentication is needed. For private Hugging Face repos or datasets, pass a token via ``IOConfig``:

```python
import daft
from daft.io import IOConfig

daft.read_warc(
"hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2025-33/segments/...",
io_config=IOConfig(hf=daft.io.HuggingFaceConfig(token="hf_xxxxxxxxxxxxxxxxxxxx"))
)
```

You can also authenticate using ``hf auth login`` — the token is picked up automatically.

### Exploring HF Buckets

Use the [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub) library to explore available data:

```python
from huggingface_hub import hffs

# List crawl archives
for path in hffs.ls('hf://buckets/commoncrawl/commoncrawl/crawl-data/'):
print(path)

# List segments in a specific crawl
for path in hffs.ls('hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2026-17/'):
print(path)

# List WARC files in a segment
for path in hffs.ls('hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2026-17/segments/1775805908305.14/warc/'):
print(path)
```

**Note:** When using ``hffs.ls()``, paths are returned without the ``hf://`` prefix. To open files returned by ``ls()``, prepend the protocol:

```python
files = hffs.ls('hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2026-17/segments/1775805908305.14/warc/')
for file_path in files[:1]:
with daft.file.open_file('hf://' + file_path, 'rb') as fh:
# ... read the WARC file
...
```

### Access Method Comparison

| Method | URL Scheme | Best For | Credentials Required |
|--------|-----------|----------|---------------------|
| Hugging Face Buckets | ``hf://buckets/...`` | Cross-region, outside AWS, cheapest | No (public) |
| AWS S3 | ``s3://commoncrawl/...`` | Inside AWS (us-east-1) | Yes |
| HTTPS | ``https://data.commoncrawl.org/...`` | Fallback when no credentials | No |

Common Crawl path structure on HF Buckets: ``hf://buckets/commoncrawl/commoncrawl/crawl-data/<crawl-name>/<path>``

## Prerequisites for access within the AWS Cloud

Common Crawl data is hosted by [Amazon Web Services' Open Data Sets Sponsorships program](https://aws.amazon.com/opendata/) which makes it freely accessible.
Expand All @@ -19,6 +103,10 @@ All Common Crawl data is stored in the `us-east-1` region. It's recommended to a

> The connection to S3 should be faster and you avoid the minimal fees for inter-region data transfer (you have to send requests which are charged as outgoing traffic).

!!! warning

Using S3 from outside AWS incurs data transfer egress fees. Consider using [Hugging Face Buckets](#access-from-hugging-face-buckets) as a cheaper alternative.

### Authentication option 1: AWS SSO Login

```bash
Expand Down Expand Up @@ -49,18 +137,14 @@ daft.datasets.common_crawl("CC-MAIN-2025-33", io_config=io_config, in_aws=True)

## Prerequisites for access outside the AWS Cloud

If you are running _outside_ of AWS, then the most optimal way to download Common Crawl data is to use their HTTPS links.
From the [Common Crawl website](https://commoncrawl.org/get-started):

> If you want to download the data to your local machine or local cluster, you can use any HTTP download agent, such as cURL or wget.

**NOTE**: When using `daft.datasets.common_crawl`, you _must_ provide `in_aws=False` when accessing data outside the AWS Cloud!
If you are running _outside_ of AWS, we recommend using [Hugging Face Buckets](#access-from-hugging-face-buckets) (`hf_buckets=True`) as the optimal access method.

Here's an example of how to use Common Crawl with Daft when outside of AWS:
As a fallback, you can also use HTTPS links (slower but no credentials required):

```python
import daft

# Use HTTPS as a fallback (slower than HF Buckets)
daft.datasets.common_crawl("CC-MAIN-2025-33", in_aws=False)
```

Expand All @@ -71,14 +155,8 @@ The simplest way to get started with Common Crawl is to load a small sample of d
```python
import daft

# If you are running this code locally, set `in_aws = True`. This will use S3.
# Otherwise, set `in_aws = False`. This will use HTTPS URLs for the files.
# You must **explicitly** set the `in_aws` parameter.
in_aws: bool = ...


# Load a sample of raw WARC data from the CC-MAIN-2025-33 crawl
daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, in_aws=in_aws).show()
# Read from Hugging Face Buckets (recommended - free, fast from anywhere)
daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, hf_buckets=True).show()
```

```{title="Output"}
Expand Down Expand Up @@ -117,27 +195,27 @@ Common Crawl provides three types of content:

```python
# Raw WARC data (default)
daft.datasets.common_crawl("CC-MAIN-2025-33", content="raw", in_aws=in_aws)
daft.datasets.common_crawl("CC-MAIN-2025-33", content="raw", hf_buckets=True)
# or equivalently
daft.datasets.common_crawl("CC-MAIN-2025-33", content="warc", in_aws=in_aws)
daft.datasets.common_crawl("CC-MAIN-2025-33", content="warc", hf_buckets=True)
```

**Extracted text, aka WET files** - Plain text content extracted from web pages:

```python
# Extracted text content
daft.datasets.common_crawl("CC-MAIN-2025-33", content="text", in_aws=in_aws)
daft.datasets.common_crawl("CC-MAIN-2025-33", content="text", hf_buckets=True)
# or equivalently
daft.datasets.common_crawl("CC-MAIN-2025-33", content="wet", in_aws=in_aws)
daft.datasets.common_crawl("CC-MAIN-2025-33", content="wet", hf_buckets=True)
```

**Metadata, aka WAT files** - Information about crawled pages without content:

```python
# Metadata only
daft.datasets.common_crawl("CC-MAIN-2025-33", content="metadata", in_aws=in_aws)
daft.datasets.common_crawl("CC-MAIN-2025-33", content="metadata", hf_buckets=True)
# or equivalently
daft.datasets.common_crawl("CC-MAIN-2025-33", content="wat", in_aws=in_aws)
daft.datasets.common_crawl("CC-MAIN-2025-33", content="wat", hf_buckets=True)
```

### Loading a subset of data
Expand All @@ -146,7 +224,7 @@ For quick testing and development, it's helpful to limit the number of crawl fil

```python
# Process only 1 crawl file for testing
daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, in_aws=in_aws)
daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, hf_buckets=True)
```

### Working with specific segments
Expand All @@ -157,7 +235,7 @@ Each crawl is split into 100 segments. You can target a specific segment:
daft.datasets.common_crawl(
"CC-MAIN-2025-33",
segment="1754151279521.11",
in_aws=in_aws,
hf_buckets=True,
)
```

Expand Down Expand Up @@ -185,7 +263,7 @@ Find the most common MIME types in a crawl:

```python
(
daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, in_aws=in_aws)
daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, hf_buckets=True)
.select(daft.col("WARC-Identified-Payload-Type"))
.groupby("WARC-Identified-Payload-Type")
.agg(daft.col("WARC-Identified-Payload-Type").count().alias("count"))
Expand Down Expand Up @@ -226,7 +304,7 @@ Content in Common Crawl WARC files are UTF-8 encoded. Use Daft's [try_decode][da
from daft.functions import try_decode

(
daft.datasets.common_crawl("CC-MAIN-2025-33", content="text", num_files=1, in_aws=in_aws)
daft.datasets.common_crawl("CC-MAIN-2025-33", content="text", num_files=1, hf_buckets=True)
.with_column("text_content", try_decode(daft.col("warc_content"), charset="utf-8"))
.where(daft.col("text_content").not_null())
.select("WARC-Target-URI", "text_content")
Expand Down
Loading