diff --git a/daft/datasets/common_crawl.py b/daft/datasets/common_crawl.py index 96fe88b0e92..210613f7aa8 100644 --- a/daft/datasets/common_crawl.py +++ b/daft/datasets/common_crawl.py @@ -24,6 +24,10 @@ def _get_http_manifest_path(crawl: str, file_type: Literal["warc", "wet", "wat"] return f"https://data.commoncrawl.org/crawl-data/{crawl}/{file_type}.paths.gz" +def _get_hf_manifest_path(crawl: str, file_type: Literal["warc", "wet", "wat"]) -> str: + return f"hf://buckets/commoncrawl/commoncrawl/crawl-data/{crawl}/{file_type}.paths.gz" + + def _unique_cc_file_paths(paths_url: str, io_config: IOConfig | None) -> DataFrame: # The manifest file is a gzipped plaintext file with one path per line. # Technically, this is equivalent to a CSV file with one column, "url", with no headers, and we could use read_csv. @@ -47,16 +51,21 @@ def _get_common_crawl_paths( io_config: IOConfig | None, *, in_aws: bool, + hf_buckets: bool = False, ) -> list[str]: """Get the paths to the Common Crawl files for a given crawl, segment, file type. Limited by `num_files`.""" - if in_aws: + if hf_buckets: + paths_url = _get_hf_manifest_path(crawl, file_type) + elif in_aws: paths_url = _get_s3_manifest_path(crawl, file_type) else: paths_url = _get_http_manifest_path(crawl, file_type) paths = _unique_cc_file_paths(paths_url, io_config) - if in_aws: + if hf_buckets: + paths = paths.select(format("hf://buckets/commoncrawl/commoncrawl/{}", col("url")).alias("url")) + elif in_aws: paths = paths.select(format("s3://commoncrawl/{}", col("url")).alias("url")) else: paths = paths.select(format("https://data.commoncrawl.org/{}", col("url")).alias("url")) @@ -79,7 +88,8 @@ def common_crawl( num_files: int | None = None, io_config: IOConfig | None = None, *, - in_aws: bool, + in_aws: bool = False, + hf_buckets: bool = False, ) -> DataFrame: r"""Load Common Crawl data as a DataFrame. 
@@ -94,11 +104,18 @@ def common_crawl( + "text" or "wet": Extracted plain text content + "metadata" or "wat": Metadata about crawled pages num_files: Limit the number of files to process. If not provided, processes all matching files. - io_config: IO configuration for accessing S3. - in_aws: Where to fetch the common crawl data from. If running in AWS, this must be set to True. If outside of AWS, - then this must be set to False. Setting this flag correctly is required for **optimal download performance**. - If running in AWS, then make sure you're in the "us-east-1" region so you don't incur S3 egress fees! - See [the Common Crawl docs](https://commoncrawl.org/get-started) for more specific instructions. + io_config: IO configuration for accessing storage. + in_aws: Fetch from AWS S3 (``s3://commoncrawl/...``). If running in AWS, set to ``True`` for optimal + performance. Set to ``False`` when running outside AWS to avoid S3 egress fees. + If running in AWS, make sure you're in the "us-east-1" region. + hf_buckets: Fetch from Hugging Face Buckets (``hf://buckets/commoncrawl/...``). This is the recommended + option for most users, especially when running outside AWS. HF Buckets are accessible from any cloud + provider and region, and are cheaper than S3 egress. See the [Hugging Face Buckets docs](https://huggingface.co/docs/huggingface_hub/guides/hf_file_system#hugging-face-buckets) for more details. + + Note: + Only one of ``in_aws`` or ``hf_buckets`` should be ``True`` at a time. If both are ``True``, ``hf_buckets`` takes precedence. If both are ``False``, + HTTPS is used as a fallback (slower but requires no credentials). + See [the Common Crawl docs](https://commoncrawl.org/get-started) for more information. Returns: A DataFrame containing the requested Common Crawl data. 
@@ -179,6 +196,7 @@ def common_crawl( num_files=num_files, io_config=io_config, in_aws=in_aws, + hf_buckets=hf_buckets, ) return read_warc(warc_paths, io_config=io_config) diff --git a/daft/filesystem.py b/daft/filesystem.py index 5d4cd4d14ef..b69f8fe267e 100644 --- a/daft/filesystem.py +++ b/daft/filesystem.py @@ -271,6 +271,21 @@ def _set_if_not_none(kwargs: dict[str, Any], key: str, val: Any | None) -> None: fsspec_fs = fsspec_fs_cls() return pafs.PyFileSystem(pafs.FSSpecHandler(fsspec_fs)), None + ### + # HF (Hugging Face): Use FSSpec as a fallback + ### + if protocol == "hf": + fsspec_fs_cls = fsspec.get_filesystem_class("hf") + hf_kwargs: dict[str, Any] = {} + if io_config is not None and io_config.hf is not None: + hf_config = io_config.hf + if hf_config.token is not None: + hf_kwargs["token"] = hf_config.token.value + if hf_config.anonymous: + hf_kwargs["token"] = None + fsspec_fs = fsspec_fs_cls(**hf_kwargs) + return pafs.PyFileSystem(pafs.FSSpecHandler(fsspec_fs)), None + ### # Gravitino GVFS: Use custom filesystem for write operations ### diff --git a/docs/datasets/common-crawl.md b/docs/datasets/common-crawl.md index 9c81d2da442..ec9b770144f 100644 --- a/docs/datasets/common-crawl.md +++ b/docs/datasets/common-crawl.md @@ -8,6 +8,90 @@ Daft provides a simple, performant, and responsible way to access Common Crawl d These APIs are in beta and may be subject to change as the Common Crawl dataset continues to be developed. +## Access from Hugging Face Buckets + +As of 2025, Common Crawl data is also hosted on [Hugging Face Buckets](https://huggingface.co/buckets/commoncrawl), providing a cheaper and more accessible alternative to AWS S3. Unlike S3, which restricts access to AWS machines in the same region (and charges inter-region egress fees), HF Buckets are accessible from any cloud provider and region using the ``hf://`` protocol. + +When using `daft.datasets.common_crawl`, set ``hf_buckets=True`` to fetch data from HF Buckets. 
This is the **recommended** option for most users, especially when running outside AWS. + +### Reading with the Common Crawl dataset helper + +```python +import daft + +# Read from HF Buckets (recommended) +daft.datasets.common_crawl("CC-MAIN-2025-33", hf_buckets=True) +``` + +### Reading WARC files directly + +You can also read WARC files directly from HF Buckets using the ``hf://`` protocol: + +```python +import daft + +# Read a single WARC file from HF Buckets +daft.read_warc("hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2026-17/segments/1775805908305.14/warc/CC-MAIN-20260410081153-20260410111153-00000.warc.gz") + +# Read WARC files matching a glob pattern +daft.read_warc("hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2026-17/segments/*/warc/*.warc.gz") +``` + +### Authentication + +For public Common Crawl data, no authentication is needed. For private Hugging Face repos or datasets, pass a token via ``IOConfig``: + +```python +from daft.io import IOConfig + +daft.read_warc( + "hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2025-33/segments/...", + io_config=IOConfig(hf=daft.io.HuggingFaceConfig(token="hf_xxxxxxxxxxxxxxxxxxxx")) +) +``` + +You can also authenticate using ``hf auth login`` — the token is picked up automatically. + +### Exploring HF Buckets + +Use the [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub) library to explore available data: + +```python +from huggingface_hub import hffs + +# List crawl archives +for path in hffs.ls('hf://buckets/commoncrawl/commoncrawl/crawl-data/'): + print(path) + +# List segments in a specific crawl +for path in hffs.ls('hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2026-17/'): + print(path) + +# List WARC files in a segment +for path in hffs.ls('hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2026-17/segments/1775805908305.14/warc/'): + print(path) +``` + +**Note:** When using ``hffs.ls()``, paths are returned without the ``hf://`` prefix. 
To open files returned by ``ls()``, prepend the protocol: + +```python +files = hffs.ls('hf://buckets/commoncrawl/commoncrawl/crawl-data/CC-MAIN-2026-17/segments/1775805908305.14/warc/') +for file_path in files[:1]: + with daft.file.open_file('hf://' + file_path, 'rb') as fh: + # ... read the WARC file + ... +``` + +### Access Method Comparison + +| Method | URL Scheme | Best For | Credentials Required | +|--------|-----------|----------|---------------------| +| Hugging Face Buckets | ``hf://buckets/...`` | Cross-region, outside AWS, cheapest | No (public) | +| AWS S3 | ``s3://commoncrawl/...`` | Inside AWS (us-east-1) | Yes | +| HTTPS | ``https://data.commoncrawl.org/...`` | Fallback when no credentials | No | + +Common Crawl path structure on HF Buckets: ``hf://buckets/commoncrawl/commoncrawl/crawl-data/<crawl>/<path>`` + ## Prerequisites for access within the AWS Cloud Common Crawl data is hosted by [Amazon Web Services' Open Data Sets Sponsorships program](https://aws.amazon.com/opendata/) which makes it freely accessible. @@ -19,6 +103,10 @@ All Common Crawl data is stored in the `us-east-1` region. It's recommended to a > The connection to S3 should be faster and you avoid the minimal fees for inter-region data transfer (you have to send requests which are charged as outgoing traffic). +!!! warning + + Using S3 from outside AWS incurs data transfer egress fees. Consider using [Hugging Face Buckets](#access-from-hugging-face-buckets) as a cheaper alternative. + ### Authentication option 1: AWS SSO Login ```bash @@ -49,18 +137,14 @@ daft.datasets.common_crawl("CC-MAIN-2025-33", io_config=io_config, in_aws=True) ## Prerequisites for access outside the AWS Cloud -If you are running _outside_ of AWS, then the most optimal way to download Common Crawl data is to use their HTTPS links. 
-From the [Common Crawl website](https://commoncrawl.org/get-started): - -> If you want to download the data to your local machine or local cluster, you can use any HTTP download agent, such as cURL or wget. - -**NOTE**: When using `daft.datasets.common_crawl`, you _must_ provide `in_aws=False` when accessing data outside the AWS Cloud! +If you are running _outside_ of AWS, we recommend using [Hugging Face Buckets](#access-from-hugging-face-buckets) (`hf_buckets=True`) as the optimal access method. -Here's an example of how to use Common Crawl with Daft when outside of AWS: +As a fallback, you can also use HTTPS links (slower but no credentials required): ```python import daft +# Use HTTPS as a fallback (slower than HF Buckets) daft.datasets.common_crawl("CC-MAIN-2025-33", in_aws=False) ``` @@ -71,14 +155,8 @@ The simplest way to get started with Common Crawl is to load a small sample of d ```python import daft -# If you are running this code locally, set `in_aws = True`. This will use S3. -# Otherwise, set `in_aws = False`. This will use HTTPS URLs for the files. -# You must **explicitly** set the `in_aws` parameter. -in_aws: bool = ... 
- - -# Load a sample of raw WARC data from the CC-MAIN-2025-33 crawl -daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, in_aws=in_aws).show() +# Read from Hugging Face Buckets (recommended - free, fast from anywhere) +daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, hf_buckets=True).show() ``` ```{title="Output"} @@ -117,27 +195,27 @@ Common Crawl provides three types of content: ```python # Raw WARC data (default) -daft.datasets.common_crawl("CC-MAIN-2025-33", content="raw", in_aws=in_aws) +daft.datasets.common_crawl("CC-MAIN-2025-33", content="raw", hf_buckets=True) # or equivalently -daft.datasets.common_crawl("CC-MAIN-2025-33", content="warc", in_aws=in_aws) +daft.datasets.common_crawl("CC-MAIN-2025-33", content="warc", hf_buckets=True) ``` **Extracted text, aka WET files** - Plain text content extracted from web pages: ```python # Extracted text content -daft.datasets.common_crawl("CC-MAIN-2025-33", content="text", in_aws=in_aws) +daft.datasets.common_crawl("CC-MAIN-2025-33", content="text", hf_buckets=True) # or equivalently -daft.datasets.common_crawl("CC-MAIN-2025-33", content="wet", in_aws=in_aws) +daft.datasets.common_crawl("CC-MAIN-2025-33", content="wet", hf_buckets=True) ``` **Metadata, aka WAT files** - Information about crawled pages without content: ```python # Metadata only -daft.datasets.common_crawl("CC-MAIN-2025-33", content="metadata", in_aws=in_aws) +daft.datasets.common_crawl("CC-MAIN-2025-33", content="metadata", hf_buckets=True) # or equivalently -daft.datasets.common_crawl("CC-MAIN-2025-33", content="wat", in_aws=in_aws) +daft.datasets.common_crawl("CC-MAIN-2025-33", content="wat", hf_buckets=True) ``` ### Loading a subset of data @@ -146,7 +224,7 @@ For quick testing and development, it's helpful to limit the number of crawl fil ```python # Process only 1 crawl file for testing -daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, in_aws=in_aws) +daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, 
hf_buckets=True) ``` ### Working with specific segments @@ -157,7 +235,7 @@ Each crawl is split into 100 segments. You can target a specific segment: daft.datasets.common_crawl( "CC-MAIN-2025-33", segment="1754151279521.11", - in_aws=in_aws, + hf_buckets=True, ) ``` @@ -185,7 +263,7 @@ Find the most common MIME types in a crawl: ```python ( - daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, in_aws=in_aws) + daft.datasets.common_crawl("CC-MAIN-2025-33", num_files=1, hf_buckets=True) .select(daft.col("WARC-Identified-Payload-Type")) .groupby("WARC-Identified-Payload-Type") .agg(daft.col("WARC-Identified-Payload-Type").count().alias("count")) @@ -226,7 +304,7 @@ Content in Common Crawl WARC files are UTF-8 encoded. Use Daft's [try_decode][da from daft.functions import try_decode ( - daft.datasets.common_crawl("CC-MAIN-2025-33", content="text", num_files=1, in_aws=in_aws) + daft.datasets.common_crawl("CC-MAIN-2025-33", content="text", num_files=1, hf_buckets=True) .with_column("text_content", try_decode(daft.col("warc_content"), charset="utf-8")) .where(daft.col("text_content").not_null()) .select("WARC-Target-URI", "text_content")